2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the two positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
116 __m128i ifour
= _mm_set1_epi32(4);
117 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
119 __m128d dummy_mask
,cutoff_mask
;
120 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121 __m128d one
= _mm_set1_pd(1.0);
122 __m128d two
= _mm_set1_pd(2.0);
128 jindex
= nlist
->jindex
;
130 shiftidx
= nlist
->shift
;
132 shiftvec
= fr
->shift_vec
[0];
133 fshift
= fr
->fshift
[0];
134 facel
= _mm_set1_pd(fr
->epsfac
);
135 charge
= mdatoms
->chargeA
;
136 nvdwtype
= fr
->ntype
;
138 vdwtype
= mdatoms
->typeA
;
140 vftab
= kernel_data
->table_elec
->data
;
141 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
143 /* Setup water-specific parameters */
144 inr
= nlist
->iinr
[0];
145 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
146 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
147 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
148 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
150 jq1
= _mm_set1_pd(charge
[inr
+1]);
151 jq2
= _mm_set1_pd(charge
[inr
+2]);
152 jq3
= _mm_set1_pd(charge
[inr
+3]);
153 vdwjidx0A
= 2*vdwtype
[inr
+0];
154 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
155 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
156 qq11
= _mm_mul_pd(iq1
,jq1
);
157 qq12
= _mm_mul_pd(iq1
,jq2
);
158 qq13
= _mm_mul_pd(iq1
,jq3
);
159 qq21
= _mm_mul_pd(iq2
,jq1
);
160 qq22
= _mm_mul_pd(iq2
,jq2
);
161 qq23
= _mm_mul_pd(iq2
,jq3
);
162 qq31
= _mm_mul_pd(iq3
,jq1
);
163 qq32
= _mm_mul_pd(iq3
,jq2
);
164 qq33
= _mm_mul_pd(iq3
,jq3
);
166 /* Avoid stupid compiler warnings */
174 /* Start outer loop over neighborlists */
175 for(iidx
=0; iidx
<nri
; iidx
++)
177 /* Load shift vector for this list */
178 i_shift_offset
= DIM
*shiftidx
[iidx
];
180 /* Load limits for loop over neighbors */
181 j_index_start
= jindex
[iidx
];
182 j_index_end
= jindex
[iidx
+1];
184 /* Get outer coordinate index */
186 i_coord_offset
= DIM
*inr
;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
190 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
192 fix0
= _mm_setzero_pd();
193 fiy0
= _mm_setzero_pd();
194 fiz0
= _mm_setzero_pd();
195 fix1
= _mm_setzero_pd();
196 fiy1
= _mm_setzero_pd();
197 fiz1
= _mm_setzero_pd();
198 fix2
= _mm_setzero_pd();
199 fiy2
= _mm_setzero_pd();
200 fiz2
= _mm_setzero_pd();
201 fix3
= _mm_setzero_pd();
202 fiy3
= _mm_setzero_pd();
203 fiz3
= _mm_setzero_pd();
205 /* Reset potential sums */
206 velecsum
= _mm_setzero_pd();
207 vvdwsum
= _mm_setzero_pd();
209 /* Start inner kernel loop */
210 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
213 /* Get j neighbor index, and coordinate index */
216 j_coord_offsetA
= DIM
*jnrA
;
217 j_coord_offsetB
= DIM
*jnrB
;
219 /* load j atom coordinates */
220 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
221 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
222 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
224 /* Calculate displacement vector */
225 dx00
= _mm_sub_pd(ix0
,jx0
);
226 dy00
= _mm_sub_pd(iy0
,jy0
);
227 dz00
= _mm_sub_pd(iz0
,jz0
);
228 dx11
= _mm_sub_pd(ix1
,jx1
);
229 dy11
= _mm_sub_pd(iy1
,jy1
);
230 dz11
= _mm_sub_pd(iz1
,jz1
);
231 dx12
= _mm_sub_pd(ix1
,jx2
);
232 dy12
= _mm_sub_pd(iy1
,jy2
);
233 dz12
= _mm_sub_pd(iz1
,jz2
);
234 dx13
= _mm_sub_pd(ix1
,jx3
);
235 dy13
= _mm_sub_pd(iy1
,jy3
);
236 dz13
= _mm_sub_pd(iz1
,jz3
);
237 dx21
= _mm_sub_pd(ix2
,jx1
);
238 dy21
= _mm_sub_pd(iy2
,jy1
);
239 dz21
= _mm_sub_pd(iz2
,jz1
);
240 dx22
= _mm_sub_pd(ix2
,jx2
);
241 dy22
= _mm_sub_pd(iy2
,jy2
);
242 dz22
= _mm_sub_pd(iz2
,jz2
);
243 dx23
= _mm_sub_pd(ix2
,jx3
);
244 dy23
= _mm_sub_pd(iy2
,jy3
);
245 dz23
= _mm_sub_pd(iz2
,jz3
);
246 dx31
= _mm_sub_pd(ix3
,jx1
);
247 dy31
= _mm_sub_pd(iy3
,jy1
);
248 dz31
= _mm_sub_pd(iz3
,jz1
);
249 dx32
= _mm_sub_pd(ix3
,jx2
);
250 dy32
= _mm_sub_pd(iy3
,jy2
);
251 dz32
= _mm_sub_pd(iz3
,jz2
);
252 dx33
= _mm_sub_pd(ix3
,jx3
);
253 dy33
= _mm_sub_pd(iy3
,jy3
);
254 dz33
= _mm_sub_pd(iz3
,jz3
);
256 /* Calculate squared distance and things based on it */
257 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
258 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
259 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
260 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
261 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
262 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
263 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
264 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
265 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
266 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
268 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
269 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
270 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
271 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
272 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
273 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
274 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
275 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
276 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
278 rinvsq00
= gmx_mm_inv_pd(rsq00
);
280 fjx0
= _mm_setzero_pd();
281 fjy0
= _mm_setzero_pd();
282 fjz0
= _mm_setzero_pd();
283 fjx1
= _mm_setzero_pd();
284 fjy1
= _mm_setzero_pd();
285 fjz1
= _mm_setzero_pd();
286 fjx2
= _mm_setzero_pd();
287 fjy2
= _mm_setzero_pd();
288 fjz2
= _mm_setzero_pd();
289 fjx3
= _mm_setzero_pd();
290 fjy3
= _mm_setzero_pd();
291 fjz3
= _mm_setzero_pd();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 /* LENNARD-JONES DISPERSION/REPULSION */
299 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
300 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
301 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
302 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
303 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
310 /* Calculate temporary vectorial force */
311 tx
= _mm_mul_pd(fscal
,dx00
);
312 ty
= _mm_mul_pd(fscal
,dy00
);
313 tz
= _mm_mul_pd(fscal
,dz00
);
315 /* Update vectorial force */
316 fix0
= _mm_add_pd(fix0
,tx
);
317 fiy0
= _mm_add_pd(fiy0
,ty
);
318 fiz0
= _mm_add_pd(fiz0
,tz
);
320 fjx0
= _mm_add_pd(fjx0
,tx
);
321 fjy0
= _mm_add_pd(fjy0
,ty
);
322 fjz0
= _mm_add_pd(fjz0
,tz
);
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 r11
= _mm_mul_pd(rsq11
,rinv11
);
330 /* Calculate table index by multiplying r with table scale and truncate to integer */
331 rt
= _mm_mul_pd(r11
,vftabscale
);
332 vfitab
= _mm_cvttpd_epi32(rt
);
333 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
334 vfitab
= _mm_slli_epi32(vfitab
,2);
336 /* CUBIC SPLINE TABLE ELECTROSTATICS */
337 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
338 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
339 GMX_MM_TRANSPOSE2_PD(Y
,F
);
340 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
341 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
342 GMX_MM_TRANSPOSE2_PD(G
,H
);
343 Heps
= _mm_mul_pd(vfeps
,H
);
344 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
345 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
346 velec
= _mm_mul_pd(qq11
,VV
);
347 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
348 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velecsum
= _mm_add_pd(velecsum
,velec
);
355 /* Calculate temporary vectorial force */
356 tx
= _mm_mul_pd(fscal
,dx11
);
357 ty
= _mm_mul_pd(fscal
,dy11
);
358 tz
= _mm_mul_pd(fscal
,dz11
);
360 /* Update vectorial force */
361 fix1
= _mm_add_pd(fix1
,tx
);
362 fiy1
= _mm_add_pd(fiy1
,ty
);
363 fiz1
= _mm_add_pd(fiz1
,tz
);
365 fjx1
= _mm_add_pd(fjx1
,tx
);
366 fjy1
= _mm_add_pd(fjy1
,ty
);
367 fjz1
= _mm_add_pd(fjz1
,tz
);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 r12
= _mm_mul_pd(rsq12
,rinv12
);
375 /* Calculate table index by multiplying r with table scale and truncate to integer */
376 rt
= _mm_mul_pd(r12
,vftabscale
);
377 vfitab
= _mm_cvttpd_epi32(rt
);
378 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
379 vfitab
= _mm_slli_epi32(vfitab
,2);
381 /* CUBIC SPLINE TABLE ELECTROSTATICS */
382 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
383 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
384 GMX_MM_TRANSPOSE2_PD(Y
,F
);
385 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
386 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
387 GMX_MM_TRANSPOSE2_PD(G
,H
);
388 Heps
= _mm_mul_pd(vfeps
,H
);
389 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
390 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
391 velec
= _mm_mul_pd(qq12
,VV
);
392 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
393 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velecsum
= _mm_add_pd(velecsum
,velec
);
400 /* Calculate temporary vectorial force */
401 tx
= _mm_mul_pd(fscal
,dx12
);
402 ty
= _mm_mul_pd(fscal
,dy12
);
403 tz
= _mm_mul_pd(fscal
,dz12
);
405 /* Update vectorial force */
406 fix1
= _mm_add_pd(fix1
,tx
);
407 fiy1
= _mm_add_pd(fiy1
,ty
);
408 fiz1
= _mm_add_pd(fiz1
,tz
);
410 fjx2
= _mm_add_pd(fjx2
,tx
);
411 fjy2
= _mm_add_pd(fjy2
,ty
);
412 fjz2
= _mm_add_pd(fjz2
,tz
);
414 /**************************
415 * CALCULATE INTERACTIONS *
416 **************************/
418 r13
= _mm_mul_pd(rsq13
,rinv13
);
420 /* Calculate table index by multiplying r with table scale and truncate to integer */
421 rt
= _mm_mul_pd(r13
,vftabscale
);
422 vfitab
= _mm_cvttpd_epi32(rt
);
423 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
424 vfitab
= _mm_slli_epi32(vfitab
,2);
426 /* CUBIC SPLINE TABLE ELECTROSTATICS */
427 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
428 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
429 GMX_MM_TRANSPOSE2_PD(Y
,F
);
430 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
431 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
432 GMX_MM_TRANSPOSE2_PD(G
,H
);
433 Heps
= _mm_mul_pd(vfeps
,H
);
434 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
435 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
436 velec
= _mm_mul_pd(qq13
,VV
);
437 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
438 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq13
,FF
),_mm_mul_pd(vftabscale
,rinv13
)));
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velecsum
= _mm_add_pd(velecsum
,velec
);
445 /* Calculate temporary vectorial force */
446 tx
= _mm_mul_pd(fscal
,dx13
);
447 ty
= _mm_mul_pd(fscal
,dy13
);
448 tz
= _mm_mul_pd(fscal
,dz13
);
450 /* Update vectorial force */
451 fix1
= _mm_add_pd(fix1
,tx
);
452 fiy1
= _mm_add_pd(fiy1
,ty
);
453 fiz1
= _mm_add_pd(fiz1
,tz
);
455 fjx3
= _mm_add_pd(fjx3
,tx
);
456 fjy3
= _mm_add_pd(fjy3
,ty
);
457 fjz3
= _mm_add_pd(fjz3
,tz
);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 r21
= _mm_mul_pd(rsq21
,rinv21
);
465 /* Calculate table index by multiplying r with table scale and truncate to integer */
466 rt
= _mm_mul_pd(r21
,vftabscale
);
467 vfitab
= _mm_cvttpd_epi32(rt
);
468 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
469 vfitab
= _mm_slli_epi32(vfitab
,2);
471 /* CUBIC SPLINE TABLE ELECTROSTATICS */
472 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
473 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
474 GMX_MM_TRANSPOSE2_PD(Y
,F
);
475 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
476 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
477 GMX_MM_TRANSPOSE2_PD(G
,H
);
478 Heps
= _mm_mul_pd(vfeps
,H
);
479 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
480 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
481 velec
= _mm_mul_pd(qq21
,VV
);
482 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
483 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum
= _mm_add_pd(velecsum
,velec
);
490 /* Calculate temporary vectorial force */
491 tx
= _mm_mul_pd(fscal
,dx21
);
492 ty
= _mm_mul_pd(fscal
,dy21
);
493 tz
= _mm_mul_pd(fscal
,dz21
);
495 /* Update vectorial force */
496 fix2
= _mm_add_pd(fix2
,tx
);
497 fiy2
= _mm_add_pd(fiy2
,ty
);
498 fiz2
= _mm_add_pd(fiz2
,tz
);
500 fjx1
= _mm_add_pd(fjx1
,tx
);
501 fjy1
= _mm_add_pd(fjy1
,ty
);
502 fjz1
= _mm_add_pd(fjz1
,tz
);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r22
= _mm_mul_pd(rsq22
,rinv22
);
510 /* Calculate table index by multiplying r with table scale and truncate to integer */
511 rt
= _mm_mul_pd(r22
,vftabscale
);
512 vfitab
= _mm_cvttpd_epi32(rt
);
513 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
514 vfitab
= _mm_slli_epi32(vfitab
,2);
516 /* CUBIC SPLINE TABLE ELECTROSTATICS */
517 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
518 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
519 GMX_MM_TRANSPOSE2_PD(Y
,F
);
520 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
521 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
522 GMX_MM_TRANSPOSE2_PD(G
,H
);
523 Heps
= _mm_mul_pd(vfeps
,H
);
524 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
525 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
526 velec
= _mm_mul_pd(qq22
,VV
);
527 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
528 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velecsum
= _mm_add_pd(velecsum
,velec
);
535 /* Calculate temporary vectorial force */
536 tx
= _mm_mul_pd(fscal
,dx22
);
537 ty
= _mm_mul_pd(fscal
,dy22
);
538 tz
= _mm_mul_pd(fscal
,dz22
);
540 /* Update vectorial force */
541 fix2
= _mm_add_pd(fix2
,tx
);
542 fiy2
= _mm_add_pd(fiy2
,ty
);
543 fiz2
= _mm_add_pd(fiz2
,tz
);
545 fjx2
= _mm_add_pd(fjx2
,tx
);
546 fjy2
= _mm_add_pd(fjy2
,ty
);
547 fjz2
= _mm_add_pd(fjz2
,tz
);
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
553 r23
= _mm_mul_pd(rsq23
,rinv23
);
555 /* Calculate table index by multiplying r with table scale and truncate to integer */
556 rt
= _mm_mul_pd(r23
,vftabscale
);
557 vfitab
= _mm_cvttpd_epi32(rt
);
558 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
559 vfitab
= _mm_slli_epi32(vfitab
,2);
561 /* CUBIC SPLINE TABLE ELECTROSTATICS */
562 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
563 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
564 GMX_MM_TRANSPOSE2_PD(Y
,F
);
565 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
566 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
567 GMX_MM_TRANSPOSE2_PD(G
,H
);
568 Heps
= _mm_mul_pd(vfeps
,H
);
569 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
570 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
571 velec
= _mm_mul_pd(qq23
,VV
);
572 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
573 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq23
,FF
),_mm_mul_pd(vftabscale
,rinv23
)));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum
= _mm_add_pd(velecsum
,velec
);
580 /* Calculate temporary vectorial force */
581 tx
= _mm_mul_pd(fscal
,dx23
);
582 ty
= _mm_mul_pd(fscal
,dy23
);
583 tz
= _mm_mul_pd(fscal
,dz23
);
585 /* Update vectorial force */
586 fix2
= _mm_add_pd(fix2
,tx
);
587 fiy2
= _mm_add_pd(fiy2
,ty
);
588 fiz2
= _mm_add_pd(fiz2
,tz
);
590 fjx3
= _mm_add_pd(fjx3
,tx
);
591 fjy3
= _mm_add_pd(fjy3
,ty
);
592 fjz3
= _mm_add_pd(fjz3
,tz
);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 r31
= _mm_mul_pd(rsq31
,rinv31
);
600 /* Calculate table index by multiplying r with table scale and truncate to integer */
601 rt
= _mm_mul_pd(r31
,vftabscale
);
602 vfitab
= _mm_cvttpd_epi32(rt
);
603 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
604 vfitab
= _mm_slli_epi32(vfitab
,2);
606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
607 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
608 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
609 GMX_MM_TRANSPOSE2_PD(Y
,F
);
610 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
611 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
612 GMX_MM_TRANSPOSE2_PD(G
,H
);
613 Heps
= _mm_mul_pd(vfeps
,H
);
614 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
615 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
616 velec
= _mm_mul_pd(qq31
,VV
);
617 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
618 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq31
,FF
),_mm_mul_pd(vftabscale
,rinv31
)));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum
= _mm_add_pd(velecsum
,velec
);
625 /* Calculate temporary vectorial force */
626 tx
= _mm_mul_pd(fscal
,dx31
);
627 ty
= _mm_mul_pd(fscal
,dy31
);
628 tz
= _mm_mul_pd(fscal
,dz31
);
630 /* Update vectorial force */
631 fix3
= _mm_add_pd(fix3
,tx
);
632 fiy3
= _mm_add_pd(fiy3
,ty
);
633 fiz3
= _mm_add_pd(fiz3
,tz
);
635 fjx1
= _mm_add_pd(fjx1
,tx
);
636 fjy1
= _mm_add_pd(fjy1
,ty
);
637 fjz1
= _mm_add_pd(fjz1
,tz
);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 r32
= _mm_mul_pd(rsq32
,rinv32
);
645 /* Calculate table index by multiplying r with table scale and truncate to integer */
646 rt
= _mm_mul_pd(r32
,vftabscale
);
647 vfitab
= _mm_cvttpd_epi32(rt
);
648 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
649 vfitab
= _mm_slli_epi32(vfitab
,2);
651 /* CUBIC SPLINE TABLE ELECTROSTATICS */
652 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
653 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
654 GMX_MM_TRANSPOSE2_PD(Y
,F
);
655 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
656 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
657 GMX_MM_TRANSPOSE2_PD(G
,H
);
658 Heps
= _mm_mul_pd(vfeps
,H
);
659 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
660 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
661 velec
= _mm_mul_pd(qq32
,VV
);
662 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
663 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq32
,FF
),_mm_mul_pd(vftabscale
,rinv32
)));
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velecsum
= _mm_add_pd(velecsum
,velec
);
670 /* Calculate temporary vectorial force */
671 tx
= _mm_mul_pd(fscal
,dx32
);
672 ty
= _mm_mul_pd(fscal
,dy32
);
673 tz
= _mm_mul_pd(fscal
,dz32
);
675 /* Update vectorial force */
676 fix3
= _mm_add_pd(fix3
,tx
);
677 fiy3
= _mm_add_pd(fiy3
,ty
);
678 fiz3
= _mm_add_pd(fiz3
,tz
);
680 fjx2
= _mm_add_pd(fjx2
,tx
);
681 fjy2
= _mm_add_pd(fjy2
,ty
);
682 fjz2
= _mm_add_pd(fjz2
,tz
);
684 /**************************
685 * CALCULATE INTERACTIONS *
686 **************************/
688 r33
= _mm_mul_pd(rsq33
,rinv33
);
690 /* Calculate table index by multiplying r with table scale and truncate to integer */
691 rt
= _mm_mul_pd(r33
,vftabscale
);
692 vfitab
= _mm_cvttpd_epi32(rt
);
693 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
694 vfitab
= _mm_slli_epi32(vfitab
,2);
696 /* CUBIC SPLINE TABLE ELECTROSTATICS */
697 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
698 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
699 GMX_MM_TRANSPOSE2_PD(Y
,F
);
700 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
701 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
702 GMX_MM_TRANSPOSE2_PD(G
,H
);
703 Heps
= _mm_mul_pd(vfeps
,H
);
704 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
705 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
706 velec
= _mm_mul_pd(qq33
,VV
);
707 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
708 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq33
,FF
),_mm_mul_pd(vftabscale
,rinv33
)));
710 /* Update potential sum for this i atom from the interaction with this j atom. */
711 velecsum
= _mm_add_pd(velecsum
,velec
);
715 /* Calculate temporary vectorial force */
716 tx
= _mm_mul_pd(fscal
,dx33
);
717 ty
= _mm_mul_pd(fscal
,dy33
);
718 tz
= _mm_mul_pd(fscal
,dz33
);
720 /* Update vectorial force */
721 fix3
= _mm_add_pd(fix3
,tx
);
722 fiy3
= _mm_add_pd(fiy3
,ty
);
723 fiz3
= _mm_add_pd(fiz3
,tz
);
725 fjx3
= _mm_add_pd(fjx3
,tx
);
726 fjy3
= _mm_add_pd(fjy3
,ty
);
727 fjz3
= _mm_add_pd(fjz3
,tz
);
729 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
731 /* Inner loop uses 422 flops */
738 j_coord_offsetA
= DIM
*jnrA
;
740 /* load j atom coordinates */
741 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
742 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
743 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
745 /* Calculate displacement vector */
746 dx00
= _mm_sub_pd(ix0
,jx0
);
747 dy00
= _mm_sub_pd(iy0
,jy0
);
748 dz00
= _mm_sub_pd(iz0
,jz0
);
749 dx11
= _mm_sub_pd(ix1
,jx1
);
750 dy11
= _mm_sub_pd(iy1
,jy1
);
751 dz11
= _mm_sub_pd(iz1
,jz1
);
752 dx12
= _mm_sub_pd(ix1
,jx2
);
753 dy12
= _mm_sub_pd(iy1
,jy2
);
754 dz12
= _mm_sub_pd(iz1
,jz2
);
755 dx13
= _mm_sub_pd(ix1
,jx3
);
756 dy13
= _mm_sub_pd(iy1
,jy3
);
757 dz13
= _mm_sub_pd(iz1
,jz3
);
758 dx21
= _mm_sub_pd(ix2
,jx1
);
759 dy21
= _mm_sub_pd(iy2
,jy1
);
760 dz21
= _mm_sub_pd(iz2
,jz1
);
761 dx22
= _mm_sub_pd(ix2
,jx2
);
762 dy22
= _mm_sub_pd(iy2
,jy2
);
763 dz22
= _mm_sub_pd(iz2
,jz2
);
764 dx23
= _mm_sub_pd(ix2
,jx3
);
765 dy23
= _mm_sub_pd(iy2
,jy3
);
766 dz23
= _mm_sub_pd(iz2
,jz3
);
767 dx31
= _mm_sub_pd(ix3
,jx1
);
768 dy31
= _mm_sub_pd(iy3
,jy1
);
769 dz31
= _mm_sub_pd(iz3
,jz1
);
770 dx32
= _mm_sub_pd(ix3
,jx2
);
771 dy32
= _mm_sub_pd(iy3
,jy2
);
772 dz32
= _mm_sub_pd(iz3
,jz2
);
773 dx33
= _mm_sub_pd(ix3
,jx3
);
774 dy33
= _mm_sub_pd(iy3
,jy3
);
775 dz33
= _mm_sub_pd(iz3
,jz3
);
777 /* Calculate squared distance and things based on it */
778 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
779 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
780 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
781 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
782 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
783 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
784 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
785 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
786 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
787 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
789 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
790 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
791 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
792 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
793 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
794 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
795 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
796 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
797 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
799 rinvsq00
= gmx_mm_inv_pd(rsq00
);
801 fjx0
= _mm_setzero_pd();
802 fjy0
= _mm_setzero_pd();
803 fjz0
= _mm_setzero_pd();
804 fjx1
= _mm_setzero_pd();
805 fjy1
= _mm_setzero_pd();
806 fjz1
= _mm_setzero_pd();
807 fjx2
= _mm_setzero_pd();
808 fjy2
= _mm_setzero_pd();
809 fjz2
= _mm_setzero_pd();
810 fjx3
= _mm_setzero_pd();
811 fjy3
= _mm_setzero_pd();
812 fjz3
= _mm_setzero_pd();
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 /* LENNARD-JONES DISPERSION/REPULSION */
820 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
821 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
822 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
823 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
824 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
826 /* Update potential sum for this i atom from the interaction with this j atom. */
827 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
828 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
832 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
834 /* Calculate temporary vectorial force */
835 tx
= _mm_mul_pd(fscal
,dx00
);
836 ty
= _mm_mul_pd(fscal
,dy00
);
837 tz
= _mm_mul_pd(fscal
,dz00
);
839 /* Update vectorial force */
840 fix0
= _mm_add_pd(fix0
,tx
);
841 fiy0
= _mm_add_pd(fiy0
,ty
);
842 fiz0
= _mm_add_pd(fiz0
,tz
);
844 fjx0
= _mm_add_pd(fjx0
,tx
);
845 fjy0
= _mm_add_pd(fjy0
,ty
);
846 fjz0
= _mm_add_pd(fjz0
,tz
);
848 /**************************
849 * CALCULATE INTERACTIONS *
850 **************************/
852 r11
= _mm_mul_pd(rsq11
,rinv11
);
854 /* Calculate table index by multiplying r with table scale and truncate to integer */
855 rt
= _mm_mul_pd(r11
,vftabscale
);
856 vfitab
= _mm_cvttpd_epi32(rt
);
857 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
858 vfitab
= _mm_slli_epi32(vfitab
,2);
860 /* CUBIC SPLINE TABLE ELECTROSTATICS */
861 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
862 F
= _mm_setzero_pd();
863 GMX_MM_TRANSPOSE2_PD(Y
,F
);
864 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
865 H
= _mm_setzero_pd();
866 GMX_MM_TRANSPOSE2_PD(G
,H
);
867 Heps
= _mm_mul_pd(vfeps
,H
);
868 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
869 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
870 velec
= _mm_mul_pd(qq11
,VV
);
871 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
872 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
876 velecsum
= _mm_add_pd(velecsum
,velec
);
880 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
882 /* Calculate temporary vectorial force */
883 tx
= _mm_mul_pd(fscal
,dx11
);
884 ty
= _mm_mul_pd(fscal
,dy11
);
885 tz
= _mm_mul_pd(fscal
,dz11
);
887 /* Update vectorial force */
888 fix1
= _mm_add_pd(fix1
,tx
);
889 fiy1
= _mm_add_pd(fiy1
,ty
);
890 fiz1
= _mm_add_pd(fiz1
,tz
);
892 fjx1
= _mm_add_pd(fjx1
,tx
);
893 fjy1
= _mm_add_pd(fjy1
,ty
);
894 fjz1
= _mm_add_pd(fjz1
,tz
);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 r12
= _mm_mul_pd(rsq12
,rinv12
);
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt
= _mm_mul_pd(r12
,vftabscale
);
904 vfitab
= _mm_cvttpd_epi32(rt
);
905 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
906 vfitab
= _mm_slli_epi32(vfitab
,2);
908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
909 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
910 F
= _mm_setzero_pd();
911 GMX_MM_TRANSPOSE2_PD(Y
,F
);
912 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
913 H
= _mm_setzero_pd();
914 GMX_MM_TRANSPOSE2_PD(G
,H
);
915 Heps
= _mm_mul_pd(vfeps
,H
);
916 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
917 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
918 velec
= _mm_mul_pd(qq12
,VV
);
919 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
920 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
924 velecsum
= _mm_add_pd(velecsum
,velec
);
928 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
930 /* Calculate temporary vectorial force */
931 tx
= _mm_mul_pd(fscal
,dx12
);
932 ty
= _mm_mul_pd(fscal
,dy12
);
933 tz
= _mm_mul_pd(fscal
,dz12
);
935 /* Update vectorial force */
936 fix1
= _mm_add_pd(fix1
,tx
);
937 fiy1
= _mm_add_pd(fiy1
,ty
);
938 fiz1
= _mm_add_pd(fiz1
,tz
);
940 fjx2
= _mm_add_pd(fjx2
,tx
);
941 fjy2
= _mm_add_pd(fjy2
,ty
);
942 fjz2
= _mm_add_pd(fjz2
,tz
);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 r13
= _mm_mul_pd(rsq13
,rinv13
);
950 /* Calculate table index by multiplying r with table scale and truncate to integer */
951 rt
= _mm_mul_pd(r13
,vftabscale
);
952 vfitab
= _mm_cvttpd_epi32(rt
);
953 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
954 vfitab
= _mm_slli_epi32(vfitab
,2);
956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
957 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
958 F
= _mm_setzero_pd();
959 GMX_MM_TRANSPOSE2_PD(Y
,F
);
960 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
961 H
= _mm_setzero_pd();
962 GMX_MM_TRANSPOSE2_PD(G
,H
);
963 Heps
= _mm_mul_pd(vfeps
,H
);
964 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
965 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
966 velec
= _mm_mul_pd(qq13
,VV
);
967 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
968 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq13
,FF
),_mm_mul_pd(vftabscale
,rinv13
)));
970 /* Update potential sum for this i atom from the interaction with this j atom. */
971 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
972 velecsum
= _mm_add_pd(velecsum
,velec
);
976 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
978 /* Calculate temporary vectorial force */
979 tx
= _mm_mul_pd(fscal
,dx13
);
980 ty
= _mm_mul_pd(fscal
,dy13
);
981 tz
= _mm_mul_pd(fscal
,dz13
);
983 /* Update vectorial force */
984 fix1
= _mm_add_pd(fix1
,tx
);
985 fiy1
= _mm_add_pd(fiy1
,ty
);
986 fiz1
= _mm_add_pd(fiz1
,tz
);
988 fjx3
= _mm_add_pd(fjx3
,tx
);
989 fjy3
= _mm_add_pd(fjy3
,ty
);
990 fjz3
= _mm_add_pd(fjz3
,tz
);
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 r21
= _mm_mul_pd(rsq21
,rinv21
);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt
= _mm_mul_pd(r21
,vftabscale
);
1000 vfitab
= _mm_cvttpd_epi32(rt
);
1001 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1002 vfitab
= _mm_slli_epi32(vfitab
,2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1006 F
= _mm_setzero_pd();
1007 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1008 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1009 H
= _mm_setzero_pd();
1010 GMX_MM_TRANSPOSE2_PD(G
,H
);
1011 Heps
= _mm_mul_pd(vfeps
,H
);
1012 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1013 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1014 velec
= _mm_mul_pd(qq21
,VV
);
1015 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1016 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1020 velecsum
= _mm_add_pd(velecsum
,velec
);
1024 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1026 /* Calculate temporary vectorial force */
1027 tx
= _mm_mul_pd(fscal
,dx21
);
1028 ty
= _mm_mul_pd(fscal
,dy21
);
1029 tz
= _mm_mul_pd(fscal
,dz21
);
1031 /* Update vectorial force */
1032 fix2
= _mm_add_pd(fix2
,tx
);
1033 fiy2
= _mm_add_pd(fiy2
,ty
);
1034 fiz2
= _mm_add_pd(fiz2
,tz
);
1036 fjx1
= _mm_add_pd(fjx1
,tx
);
1037 fjy1
= _mm_add_pd(fjy1
,ty
);
1038 fjz1
= _mm_add_pd(fjz1
,tz
);
1040 /**************************
1041 * CALCULATE INTERACTIONS *
1042 **************************/
1044 r22
= _mm_mul_pd(rsq22
,rinv22
);
1046 /* Calculate table index by multiplying r with table scale and truncate to integer */
1047 rt
= _mm_mul_pd(r22
,vftabscale
);
1048 vfitab
= _mm_cvttpd_epi32(rt
);
1049 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1050 vfitab
= _mm_slli_epi32(vfitab
,2);
1052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1054 F
= _mm_setzero_pd();
1055 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1056 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1057 H
= _mm_setzero_pd();
1058 GMX_MM_TRANSPOSE2_PD(G
,H
);
1059 Heps
= _mm_mul_pd(vfeps
,H
);
1060 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1061 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1062 velec
= _mm_mul_pd(qq22
,VV
);
1063 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1064 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1066 /* Update potential sum for this i atom from the interaction with this j atom. */
1067 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1068 velecsum
= _mm_add_pd(velecsum
,velec
);
1072 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1074 /* Calculate temporary vectorial force */
1075 tx
= _mm_mul_pd(fscal
,dx22
);
1076 ty
= _mm_mul_pd(fscal
,dy22
);
1077 tz
= _mm_mul_pd(fscal
,dz22
);
1079 /* Update vectorial force */
1080 fix2
= _mm_add_pd(fix2
,tx
);
1081 fiy2
= _mm_add_pd(fiy2
,ty
);
1082 fiz2
= _mm_add_pd(fiz2
,tz
);
1084 fjx2
= _mm_add_pd(fjx2
,tx
);
1085 fjy2
= _mm_add_pd(fjy2
,ty
);
1086 fjz2
= _mm_add_pd(fjz2
,tz
);
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1092 r23
= _mm_mul_pd(rsq23
,rinv23
);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt
= _mm_mul_pd(r23
,vftabscale
);
1096 vfitab
= _mm_cvttpd_epi32(rt
);
1097 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1098 vfitab
= _mm_slli_epi32(vfitab
,2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1102 F
= _mm_setzero_pd();
1103 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1104 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1105 H
= _mm_setzero_pd();
1106 GMX_MM_TRANSPOSE2_PD(G
,H
);
1107 Heps
= _mm_mul_pd(vfeps
,H
);
1108 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1109 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1110 velec
= _mm_mul_pd(qq23
,VV
);
1111 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1112 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq23
,FF
),_mm_mul_pd(vftabscale
,rinv23
)));
1114 /* Update potential sum for this i atom from the interaction with this j atom. */
1115 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1116 velecsum
= _mm_add_pd(velecsum
,velec
);
1120 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1122 /* Calculate temporary vectorial force */
1123 tx
= _mm_mul_pd(fscal
,dx23
);
1124 ty
= _mm_mul_pd(fscal
,dy23
);
1125 tz
= _mm_mul_pd(fscal
,dz23
);
1127 /* Update vectorial force */
1128 fix2
= _mm_add_pd(fix2
,tx
);
1129 fiy2
= _mm_add_pd(fiy2
,ty
);
1130 fiz2
= _mm_add_pd(fiz2
,tz
);
1132 fjx3
= _mm_add_pd(fjx3
,tx
);
1133 fjy3
= _mm_add_pd(fjy3
,ty
);
1134 fjz3
= _mm_add_pd(fjz3
,tz
);
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 r31
= _mm_mul_pd(rsq31
,rinv31
);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt
= _mm_mul_pd(r31
,vftabscale
);
1144 vfitab
= _mm_cvttpd_epi32(rt
);
1145 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1146 vfitab
= _mm_slli_epi32(vfitab
,2);
1148 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1150 F
= _mm_setzero_pd();
1151 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1152 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1153 H
= _mm_setzero_pd();
1154 GMX_MM_TRANSPOSE2_PD(G
,H
);
1155 Heps
= _mm_mul_pd(vfeps
,H
);
1156 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1157 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1158 velec
= _mm_mul_pd(qq31
,VV
);
1159 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1160 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq31
,FF
),_mm_mul_pd(vftabscale
,rinv31
)));
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1164 velecsum
= _mm_add_pd(velecsum
,velec
);
1168 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1170 /* Calculate temporary vectorial force */
1171 tx
= _mm_mul_pd(fscal
,dx31
);
1172 ty
= _mm_mul_pd(fscal
,dy31
);
1173 tz
= _mm_mul_pd(fscal
,dz31
);
1175 /* Update vectorial force */
1176 fix3
= _mm_add_pd(fix3
,tx
);
1177 fiy3
= _mm_add_pd(fiy3
,ty
);
1178 fiz3
= _mm_add_pd(fiz3
,tz
);
1180 fjx1
= _mm_add_pd(fjx1
,tx
);
1181 fjy1
= _mm_add_pd(fjy1
,ty
);
1182 fjz1
= _mm_add_pd(fjz1
,tz
);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 r32
= _mm_mul_pd(rsq32
,rinv32
);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt
= _mm_mul_pd(r32
,vftabscale
);
1192 vfitab
= _mm_cvttpd_epi32(rt
);
1193 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1194 vfitab
= _mm_slli_epi32(vfitab
,2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1198 F
= _mm_setzero_pd();
1199 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1200 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1201 H
= _mm_setzero_pd();
1202 GMX_MM_TRANSPOSE2_PD(G
,H
);
1203 Heps
= _mm_mul_pd(vfeps
,H
);
1204 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1205 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1206 velec
= _mm_mul_pd(qq32
,VV
);
1207 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1208 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq32
,FF
),_mm_mul_pd(vftabscale
,rinv32
)));
1210 /* Update potential sum for this i atom from the interaction with this j atom. */
1211 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1212 velecsum
= _mm_add_pd(velecsum
,velec
);
1216 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1218 /* Calculate temporary vectorial force */
1219 tx
= _mm_mul_pd(fscal
,dx32
);
1220 ty
= _mm_mul_pd(fscal
,dy32
);
1221 tz
= _mm_mul_pd(fscal
,dz32
);
1223 /* Update vectorial force */
1224 fix3
= _mm_add_pd(fix3
,tx
);
1225 fiy3
= _mm_add_pd(fiy3
,ty
);
1226 fiz3
= _mm_add_pd(fiz3
,tz
);
1228 fjx2
= _mm_add_pd(fjx2
,tx
);
1229 fjy2
= _mm_add_pd(fjy2
,ty
);
1230 fjz2
= _mm_add_pd(fjz2
,tz
);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 r33
= _mm_mul_pd(rsq33
,rinv33
);
1238 /* Calculate table index by multiplying r with table scale and truncate to integer */
1239 rt
= _mm_mul_pd(r33
,vftabscale
);
1240 vfitab
= _mm_cvttpd_epi32(rt
);
1241 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1242 vfitab
= _mm_slli_epi32(vfitab
,2);
1244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1245 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1246 F
= _mm_setzero_pd();
1247 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1248 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1249 H
= _mm_setzero_pd();
1250 GMX_MM_TRANSPOSE2_PD(G
,H
);
1251 Heps
= _mm_mul_pd(vfeps
,H
);
1252 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1253 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1254 velec
= _mm_mul_pd(qq33
,VV
);
1255 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1256 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq33
,FF
),_mm_mul_pd(vftabscale
,rinv33
)));
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1260 velecsum
= _mm_add_pd(velecsum
,velec
);
1264 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1266 /* Calculate temporary vectorial force */
1267 tx
= _mm_mul_pd(fscal
,dx33
);
1268 ty
= _mm_mul_pd(fscal
,dy33
);
1269 tz
= _mm_mul_pd(fscal
,dz33
);
1271 /* Update vectorial force */
1272 fix3
= _mm_add_pd(fix3
,tx
);
1273 fiy3
= _mm_add_pd(fiy3
,ty
);
1274 fiz3
= _mm_add_pd(fiz3
,tz
);
1276 fjx3
= _mm_add_pd(fjx3
,tx
);
1277 fjy3
= _mm_add_pd(fjy3
,ty
);
1278 fjz3
= _mm_add_pd(fjz3
,tz
);
1280 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1282 /* Inner loop uses 422 flops */
1285 /* End of innermost loop */
1287 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1288 f
+i_coord_offset
,fshift
+i_shift_offset
);
1291 /* Update potential energies */
1292 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1293 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1295 /* Increment number of inner iterations */
1296 inneriter
+= j_index_end
- j_index_start
;
1298 /* Outer loop uses 26 flops */
1301 /* Increment number of outer iterations */
1304 /* Update outer/inner flops */
1306 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*422);
1309 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1310 * Electrostatics interaction: CubicSplineTable
1311 * VdW interaction: LennardJones
1312 * Geometry: Water4-Water4
1313 * Calculate force/pot: Force
1316 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1317 (t_nblist
* gmx_restrict nlist
,
1318 rvec
* gmx_restrict xx
,
1319 rvec
* gmx_restrict ff
,
1320 t_forcerec
* gmx_restrict fr
,
1321 t_mdatoms
* gmx_restrict mdatoms
,
1322 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1323 t_nrnb
* gmx_restrict nrnb
)
1325 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1326 * just 0 for non-waters.
1327 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1328 * jnr indices corresponding to data put in the two positions in the SIMD register.
1330 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1331 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1333 int j_coord_offsetA
,j_coord_offsetB
;
1334 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1335 real rcutoff_scalar
;
1336 real
*shiftvec
,*fshift
,*x
,*f
;
1337 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1339 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1341 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1343 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1345 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1346 int vdwjidx0A
,vdwjidx0B
;
1347 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1348 int vdwjidx1A
,vdwjidx1B
;
1349 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1350 int vdwjidx2A
,vdwjidx2B
;
1351 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1352 int vdwjidx3A
,vdwjidx3B
;
1353 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1354 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1355 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1356 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1357 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1358 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1359 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1360 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1361 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1362 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1363 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1364 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1367 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1370 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1371 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1373 __m128i ifour
= _mm_set1_epi32(4);
1374 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1376 __m128d dummy_mask
,cutoff_mask
;
1377 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1378 __m128d one
= _mm_set1_pd(1.0);
1379 __m128d two
= _mm_set1_pd(2.0);
1385 jindex
= nlist
->jindex
;
1387 shiftidx
= nlist
->shift
;
1389 shiftvec
= fr
->shift_vec
[0];
1390 fshift
= fr
->fshift
[0];
1391 facel
= _mm_set1_pd(fr
->epsfac
);
1392 charge
= mdatoms
->chargeA
;
1393 nvdwtype
= fr
->ntype
;
1394 vdwparam
= fr
->nbfp
;
1395 vdwtype
= mdatoms
->typeA
;
1397 vftab
= kernel_data
->table_elec
->data
;
1398 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
1400 /* Setup water-specific parameters */
1401 inr
= nlist
->iinr
[0];
1402 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1403 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1404 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1405 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1407 jq1
= _mm_set1_pd(charge
[inr
+1]);
1408 jq2
= _mm_set1_pd(charge
[inr
+2]);
1409 jq3
= _mm_set1_pd(charge
[inr
+3]);
1410 vdwjidx0A
= 2*vdwtype
[inr
+0];
1411 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1412 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1413 qq11
= _mm_mul_pd(iq1
,jq1
);
1414 qq12
= _mm_mul_pd(iq1
,jq2
);
1415 qq13
= _mm_mul_pd(iq1
,jq3
);
1416 qq21
= _mm_mul_pd(iq2
,jq1
);
1417 qq22
= _mm_mul_pd(iq2
,jq2
);
1418 qq23
= _mm_mul_pd(iq2
,jq3
);
1419 qq31
= _mm_mul_pd(iq3
,jq1
);
1420 qq32
= _mm_mul_pd(iq3
,jq2
);
1421 qq33
= _mm_mul_pd(iq3
,jq3
);
1423 /* Avoid stupid compiler warnings */
1425 j_coord_offsetA
= 0;
1426 j_coord_offsetB
= 0;
1431 /* Start outer loop over neighborlists */
1432 for(iidx
=0; iidx
<nri
; iidx
++)
1434 /* Load shift vector for this list */
1435 i_shift_offset
= DIM
*shiftidx
[iidx
];
1437 /* Load limits for loop over neighbors */
1438 j_index_start
= jindex
[iidx
];
1439 j_index_end
= jindex
[iidx
+1];
1441 /* Get outer coordinate index */
1443 i_coord_offset
= DIM
*inr
;
1445 /* Load i particle coords and add shift vector */
1446 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1447 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1449 fix0
= _mm_setzero_pd();
1450 fiy0
= _mm_setzero_pd();
1451 fiz0
= _mm_setzero_pd();
1452 fix1
= _mm_setzero_pd();
1453 fiy1
= _mm_setzero_pd();
1454 fiz1
= _mm_setzero_pd();
1455 fix2
= _mm_setzero_pd();
1456 fiy2
= _mm_setzero_pd();
1457 fiz2
= _mm_setzero_pd();
1458 fix3
= _mm_setzero_pd();
1459 fiy3
= _mm_setzero_pd();
1460 fiz3
= _mm_setzero_pd();
1462 /* Start inner kernel loop */
1463 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1466 /* Get j neighbor index, and coordinate index */
1468 jnrB
= jjnr
[jidx
+1];
1469 j_coord_offsetA
= DIM
*jnrA
;
1470 j_coord_offsetB
= DIM
*jnrB
;
1472 /* load j atom coordinates */
1473 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1474 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1475 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1477 /* Calculate displacement vector */
1478 dx00
= _mm_sub_pd(ix0
,jx0
);
1479 dy00
= _mm_sub_pd(iy0
,jy0
);
1480 dz00
= _mm_sub_pd(iz0
,jz0
);
1481 dx11
= _mm_sub_pd(ix1
,jx1
);
1482 dy11
= _mm_sub_pd(iy1
,jy1
);
1483 dz11
= _mm_sub_pd(iz1
,jz1
);
1484 dx12
= _mm_sub_pd(ix1
,jx2
);
1485 dy12
= _mm_sub_pd(iy1
,jy2
);
1486 dz12
= _mm_sub_pd(iz1
,jz2
);
1487 dx13
= _mm_sub_pd(ix1
,jx3
);
1488 dy13
= _mm_sub_pd(iy1
,jy3
);
1489 dz13
= _mm_sub_pd(iz1
,jz3
);
1490 dx21
= _mm_sub_pd(ix2
,jx1
);
1491 dy21
= _mm_sub_pd(iy2
,jy1
);
1492 dz21
= _mm_sub_pd(iz2
,jz1
);
1493 dx22
= _mm_sub_pd(ix2
,jx2
);
1494 dy22
= _mm_sub_pd(iy2
,jy2
);
1495 dz22
= _mm_sub_pd(iz2
,jz2
);
1496 dx23
= _mm_sub_pd(ix2
,jx3
);
1497 dy23
= _mm_sub_pd(iy2
,jy3
);
1498 dz23
= _mm_sub_pd(iz2
,jz3
);
1499 dx31
= _mm_sub_pd(ix3
,jx1
);
1500 dy31
= _mm_sub_pd(iy3
,jy1
);
1501 dz31
= _mm_sub_pd(iz3
,jz1
);
1502 dx32
= _mm_sub_pd(ix3
,jx2
);
1503 dy32
= _mm_sub_pd(iy3
,jy2
);
1504 dz32
= _mm_sub_pd(iz3
,jz2
);
1505 dx33
= _mm_sub_pd(ix3
,jx3
);
1506 dy33
= _mm_sub_pd(iy3
,jy3
);
1507 dz33
= _mm_sub_pd(iz3
,jz3
);
1509 /* Calculate squared distance and things based on it */
1510 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1511 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1512 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1513 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1514 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1515 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1516 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1517 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1518 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1519 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1521 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1522 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1523 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1524 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1525 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1526 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1527 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1528 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1529 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1531 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1533 fjx0
= _mm_setzero_pd();
1534 fjy0
= _mm_setzero_pd();
1535 fjz0
= _mm_setzero_pd();
1536 fjx1
= _mm_setzero_pd();
1537 fjy1
= _mm_setzero_pd();
1538 fjz1
= _mm_setzero_pd();
1539 fjx2
= _mm_setzero_pd();
1540 fjy2
= _mm_setzero_pd();
1541 fjz2
= _mm_setzero_pd();
1542 fjx3
= _mm_setzero_pd();
1543 fjy3
= _mm_setzero_pd();
1544 fjz3
= _mm_setzero_pd();
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 /* LENNARD-JONES DISPERSION/REPULSION */
1552 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1553 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1557 /* Calculate temporary vectorial force */
1558 tx
= _mm_mul_pd(fscal
,dx00
);
1559 ty
= _mm_mul_pd(fscal
,dy00
);
1560 tz
= _mm_mul_pd(fscal
,dz00
);
1562 /* Update vectorial force */
1563 fix0
= _mm_add_pd(fix0
,tx
);
1564 fiy0
= _mm_add_pd(fiy0
,ty
);
1565 fiz0
= _mm_add_pd(fiz0
,tz
);
1567 fjx0
= _mm_add_pd(fjx0
,tx
);
1568 fjy0
= _mm_add_pd(fjy0
,ty
);
1569 fjz0
= _mm_add_pd(fjz0
,tz
);
1571 /**************************
1572 * CALCULATE INTERACTIONS *
1573 **************************/
1575 r11
= _mm_mul_pd(rsq11
,rinv11
);
1577 /* Calculate table index by multiplying r with table scale and truncate to integer */
1578 rt
= _mm_mul_pd(r11
,vftabscale
);
1579 vfitab
= _mm_cvttpd_epi32(rt
);
1580 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1581 vfitab
= _mm_slli_epi32(vfitab
,2);
1583 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1584 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1585 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1586 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1587 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1588 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1589 GMX_MM_TRANSPOSE2_PD(G
,H
);
1590 Heps
= _mm_mul_pd(vfeps
,H
);
1591 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1592 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1593 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1597 /* Calculate temporary vectorial force */
1598 tx
= _mm_mul_pd(fscal
,dx11
);
1599 ty
= _mm_mul_pd(fscal
,dy11
);
1600 tz
= _mm_mul_pd(fscal
,dz11
);
1602 /* Update vectorial force */
1603 fix1
= _mm_add_pd(fix1
,tx
);
1604 fiy1
= _mm_add_pd(fiy1
,ty
);
1605 fiz1
= _mm_add_pd(fiz1
,tz
);
1607 fjx1
= _mm_add_pd(fjx1
,tx
);
1608 fjy1
= _mm_add_pd(fjy1
,ty
);
1609 fjz1
= _mm_add_pd(fjz1
,tz
);
1611 /**************************
1612 * CALCULATE INTERACTIONS *
1613 **************************/
1615 r12
= _mm_mul_pd(rsq12
,rinv12
);
1617 /* Calculate table index by multiplying r with table scale and truncate to integer */
1618 rt
= _mm_mul_pd(r12
,vftabscale
);
1619 vfitab
= _mm_cvttpd_epi32(rt
);
1620 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1621 vfitab
= _mm_slli_epi32(vfitab
,2);
1623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1624 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1625 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1626 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1627 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1628 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1629 GMX_MM_TRANSPOSE2_PD(G
,H
);
1630 Heps
= _mm_mul_pd(vfeps
,H
);
1631 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1632 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1633 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1637 /* Calculate temporary vectorial force */
1638 tx
= _mm_mul_pd(fscal
,dx12
);
1639 ty
= _mm_mul_pd(fscal
,dy12
);
1640 tz
= _mm_mul_pd(fscal
,dz12
);
1642 /* Update vectorial force */
1643 fix1
= _mm_add_pd(fix1
,tx
);
1644 fiy1
= _mm_add_pd(fiy1
,ty
);
1645 fiz1
= _mm_add_pd(fiz1
,tz
);
1647 fjx2
= _mm_add_pd(fjx2
,tx
);
1648 fjy2
= _mm_add_pd(fjy2
,ty
);
1649 fjz2
= _mm_add_pd(fjz2
,tz
);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 r13
= _mm_mul_pd(rsq13
,rinv13
);
1657 /* Calculate table index by multiplying r with table scale and truncate to integer */
1658 rt
= _mm_mul_pd(r13
,vftabscale
);
1659 vfitab
= _mm_cvttpd_epi32(rt
);
1660 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1661 vfitab
= _mm_slli_epi32(vfitab
,2);
1663 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1664 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1665 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1666 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1667 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1668 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1669 GMX_MM_TRANSPOSE2_PD(G
,H
);
1670 Heps
= _mm_mul_pd(vfeps
,H
);
1671 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1672 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1673 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq13
,FF
),_mm_mul_pd(vftabscale
,rinv13
)));
1677 /* Calculate temporary vectorial force */
1678 tx
= _mm_mul_pd(fscal
,dx13
);
1679 ty
= _mm_mul_pd(fscal
,dy13
);
1680 tz
= _mm_mul_pd(fscal
,dz13
);
1682 /* Update vectorial force */
1683 fix1
= _mm_add_pd(fix1
,tx
);
1684 fiy1
= _mm_add_pd(fiy1
,ty
);
1685 fiz1
= _mm_add_pd(fiz1
,tz
);
1687 fjx3
= _mm_add_pd(fjx3
,tx
);
1688 fjy3
= _mm_add_pd(fjy3
,ty
);
1689 fjz3
= _mm_add_pd(fjz3
,tz
);
1691 /**************************
1692 * CALCULATE INTERACTIONS *
1693 **************************/
1695 r21
= _mm_mul_pd(rsq21
,rinv21
);
1697 /* Calculate table index by multiplying r with table scale and truncate to integer */
1698 rt
= _mm_mul_pd(r21
,vftabscale
);
1699 vfitab
= _mm_cvttpd_epi32(rt
);
1700 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1701 vfitab
= _mm_slli_epi32(vfitab
,2);
1703 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1704 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1705 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1706 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1707 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1708 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1709 GMX_MM_TRANSPOSE2_PD(G
,H
);
1710 Heps
= _mm_mul_pd(vfeps
,H
);
1711 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1712 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1713 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1717 /* Calculate temporary vectorial force */
1718 tx
= _mm_mul_pd(fscal
,dx21
);
1719 ty
= _mm_mul_pd(fscal
,dy21
);
1720 tz
= _mm_mul_pd(fscal
,dz21
);
1722 /* Update vectorial force */
1723 fix2
= _mm_add_pd(fix2
,tx
);
1724 fiy2
= _mm_add_pd(fiy2
,ty
);
1725 fiz2
= _mm_add_pd(fiz2
,tz
);
1727 fjx1
= _mm_add_pd(fjx1
,tx
);
1728 fjy1
= _mm_add_pd(fjy1
,ty
);
1729 fjz1
= _mm_add_pd(fjz1
,tz
);
1731 /**************************
1732 * CALCULATE INTERACTIONS *
1733 **************************/
1735 r22
= _mm_mul_pd(rsq22
,rinv22
);
1737 /* Calculate table index by multiplying r with table scale and truncate to integer */
1738 rt
= _mm_mul_pd(r22
,vftabscale
);
1739 vfitab
= _mm_cvttpd_epi32(rt
);
1740 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1741 vfitab
= _mm_slli_epi32(vfitab
,2);
1743 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1744 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1745 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1746 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1747 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1748 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1749 GMX_MM_TRANSPOSE2_PD(G
,H
);
1750 Heps
= _mm_mul_pd(vfeps
,H
);
1751 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1752 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1753 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1757 /* Calculate temporary vectorial force */
1758 tx
= _mm_mul_pd(fscal
,dx22
);
1759 ty
= _mm_mul_pd(fscal
,dy22
);
1760 tz
= _mm_mul_pd(fscal
,dz22
);
1762 /* Update vectorial force */
1763 fix2
= _mm_add_pd(fix2
,tx
);
1764 fiy2
= _mm_add_pd(fiy2
,ty
);
1765 fiz2
= _mm_add_pd(fiz2
,tz
);
1767 fjx2
= _mm_add_pd(fjx2
,tx
);
1768 fjy2
= _mm_add_pd(fjy2
,ty
);
1769 fjz2
= _mm_add_pd(fjz2
,tz
);
1771 /**************************
1772 * CALCULATE INTERACTIONS *
1773 **************************/
1775 r23
= _mm_mul_pd(rsq23
,rinv23
);
1777 /* Calculate table index by multiplying r with table scale and truncate to integer */
1778 rt
= _mm_mul_pd(r23
,vftabscale
);
1779 vfitab
= _mm_cvttpd_epi32(rt
);
1780 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1781 vfitab
= _mm_slli_epi32(vfitab
,2);
1783 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1784 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1785 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1786 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1787 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1788 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1789 GMX_MM_TRANSPOSE2_PD(G
,H
);
1790 Heps
= _mm_mul_pd(vfeps
,H
);
1791 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1792 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1793 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq23
,FF
),_mm_mul_pd(vftabscale
,rinv23
)));
1797 /* Calculate temporary vectorial force */
1798 tx
= _mm_mul_pd(fscal
,dx23
);
1799 ty
= _mm_mul_pd(fscal
,dy23
);
1800 tz
= _mm_mul_pd(fscal
,dz23
);
1802 /* Update vectorial force */
1803 fix2
= _mm_add_pd(fix2
,tx
);
1804 fiy2
= _mm_add_pd(fiy2
,ty
);
1805 fiz2
= _mm_add_pd(fiz2
,tz
);
1807 fjx3
= _mm_add_pd(fjx3
,tx
);
1808 fjy3
= _mm_add_pd(fjy3
,ty
);
1809 fjz3
= _mm_add_pd(fjz3
,tz
);
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 r31
= _mm_mul_pd(rsq31
,rinv31
);
1817 /* Calculate table index by multiplying r with table scale and truncate to integer */
1818 rt
= _mm_mul_pd(r31
,vftabscale
);
1819 vfitab
= _mm_cvttpd_epi32(rt
);
1820 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1821 vfitab
= _mm_slli_epi32(vfitab
,2);
1823 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1824 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1825 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1826 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1827 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1828 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1829 GMX_MM_TRANSPOSE2_PD(G
,H
);
1830 Heps
= _mm_mul_pd(vfeps
,H
);
1831 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1832 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1833 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq31
,FF
),_mm_mul_pd(vftabscale
,rinv31
)));
1837 /* Calculate temporary vectorial force */
1838 tx
= _mm_mul_pd(fscal
,dx31
);
1839 ty
= _mm_mul_pd(fscal
,dy31
);
1840 tz
= _mm_mul_pd(fscal
,dz31
);
1842 /* Update vectorial force */
1843 fix3
= _mm_add_pd(fix3
,tx
);
1844 fiy3
= _mm_add_pd(fiy3
,ty
);
1845 fiz3
= _mm_add_pd(fiz3
,tz
);
1847 fjx1
= _mm_add_pd(fjx1
,tx
);
1848 fjy1
= _mm_add_pd(fjy1
,ty
);
1849 fjz1
= _mm_add_pd(fjz1
,tz
);
1851 /**************************
1852 * CALCULATE INTERACTIONS *
1853 **************************/
1855 r32
= _mm_mul_pd(rsq32
,rinv32
);
1857 /* Calculate table index by multiplying r with table scale and truncate to integer */
1858 rt
= _mm_mul_pd(r32
,vftabscale
);
1859 vfitab
= _mm_cvttpd_epi32(rt
);
1860 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1861 vfitab
= _mm_slli_epi32(vfitab
,2);
1863 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1864 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1865 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1866 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1867 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1868 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1869 GMX_MM_TRANSPOSE2_PD(G
,H
);
1870 Heps
= _mm_mul_pd(vfeps
,H
);
1871 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1872 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1873 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq32
,FF
),_mm_mul_pd(vftabscale
,rinv32
)));
1877 /* Calculate temporary vectorial force */
1878 tx
= _mm_mul_pd(fscal
,dx32
);
1879 ty
= _mm_mul_pd(fscal
,dy32
);
1880 tz
= _mm_mul_pd(fscal
,dz32
);
1882 /* Update vectorial force */
1883 fix3
= _mm_add_pd(fix3
,tx
);
1884 fiy3
= _mm_add_pd(fiy3
,ty
);
1885 fiz3
= _mm_add_pd(fiz3
,tz
);
1887 fjx2
= _mm_add_pd(fjx2
,tx
);
1888 fjy2
= _mm_add_pd(fjy2
,ty
);
1889 fjz2
= _mm_add_pd(fjz2
,tz
);
1891 /**************************
1892 * CALCULATE INTERACTIONS *
1893 **************************/
1895 r33
= _mm_mul_pd(rsq33
,rinv33
);
1897 /* Calculate table index by multiplying r with table scale and truncate to integer */
1898 rt
= _mm_mul_pd(r33
,vftabscale
);
1899 vfitab
= _mm_cvttpd_epi32(rt
);
1900 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1901 vfitab
= _mm_slli_epi32(vfitab
,2);
1903 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1904 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1905 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1906 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1907 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1908 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1909 GMX_MM_TRANSPOSE2_PD(G
,H
);
1910 Heps
= _mm_mul_pd(vfeps
,H
);
1911 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1912 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1913 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq33
,FF
),_mm_mul_pd(vftabscale
,rinv33
)));
1917 /* Calculate temporary vectorial force */
1918 tx
= _mm_mul_pd(fscal
,dx33
);
1919 ty
= _mm_mul_pd(fscal
,dy33
);
1920 tz
= _mm_mul_pd(fscal
,dz33
);
1922 /* Update vectorial force */
1923 fix3
= _mm_add_pd(fix3
,tx
);
1924 fiy3
= _mm_add_pd(fiy3
,ty
);
1925 fiz3
= _mm_add_pd(fiz3
,tz
);
1927 fjx3
= _mm_add_pd(fjx3
,tx
);
1928 fjy3
= _mm_add_pd(fjy3
,ty
);
1929 fjz3
= _mm_add_pd(fjz3
,tz
);
1931 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1933 /* Inner loop uses 381 flops */
1936 if(jidx
<j_index_end
)
1940 j_coord_offsetA
= DIM
*jnrA
;
1942 /* load j atom coordinates */
1943 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1944 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1945 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1947 /* Calculate displacement vector */
1948 dx00
= _mm_sub_pd(ix0
,jx0
);
1949 dy00
= _mm_sub_pd(iy0
,jy0
);
1950 dz00
= _mm_sub_pd(iz0
,jz0
);
1951 dx11
= _mm_sub_pd(ix1
,jx1
);
1952 dy11
= _mm_sub_pd(iy1
,jy1
);
1953 dz11
= _mm_sub_pd(iz1
,jz1
);
1954 dx12
= _mm_sub_pd(ix1
,jx2
);
1955 dy12
= _mm_sub_pd(iy1
,jy2
);
1956 dz12
= _mm_sub_pd(iz1
,jz2
);
1957 dx13
= _mm_sub_pd(ix1
,jx3
);
1958 dy13
= _mm_sub_pd(iy1
,jy3
);
1959 dz13
= _mm_sub_pd(iz1
,jz3
);
1960 dx21
= _mm_sub_pd(ix2
,jx1
);
1961 dy21
= _mm_sub_pd(iy2
,jy1
);
1962 dz21
= _mm_sub_pd(iz2
,jz1
);
1963 dx22
= _mm_sub_pd(ix2
,jx2
);
1964 dy22
= _mm_sub_pd(iy2
,jy2
);
1965 dz22
= _mm_sub_pd(iz2
,jz2
);
1966 dx23
= _mm_sub_pd(ix2
,jx3
);
1967 dy23
= _mm_sub_pd(iy2
,jy3
);
1968 dz23
= _mm_sub_pd(iz2
,jz3
);
1969 dx31
= _mm_sub_pd(ix3
,jx1
);
1970 dy31
= _mm_sub_pd(iy3
,jy1
);
1971 dz31
= _mm_sub_pd(iz3
,jz1
);
1972 dx32
= _mm_sub_pd(ix3
,jx2
);
1973 dy32
= _mm_sub_pd(iy3
,jy2
);
1974 dz32
= _mm_sub_pd(iz3
,jz2
);
1975 dx33
= _mm_sub_pd(ix3
,jx3
);
1976 dy33
= _mm_sub_pd(iy3
,jy3
);
1977 dz33
= _mm_sub_pd(iz3
,jz3
);
1979 /* Calculate squared distance and things based on it */
1980 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1981 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1982 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1983 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1984 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1985 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1986 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1987 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1988 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1989 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1991 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1992 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1993 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1994 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1995 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1996 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1997 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1998 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1999 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
2001 rinvsq00
= gmx_mm_inv_pd(rsq00
);
2003 fjx0
= _mm_setzero_pd();
2004 fjy0
= _mm_setzero_pd();
2005 fjz0
= _mm_setzero_pd();
2006 fjx1
= _mm_setzero_pd();
2007 fjy1
= _mm_setzero_pd();
2008 fjz1
= _mm_setzero_pd();
2009 fjx2
= _mm_setzero_pd();
2010 fjy2
= _mm_setzero_pd();
2011 fjz2
= _mm_setzero_pd();
2012 fjx3
= _mm_setzero_pd();
2013 fjy3
= _mm_setzero_pd();
2014 fjz3
= _mm_setzero_pd();
2016 /**************************
2017 * CALCULATE INTERACTIONS *
2018 **************************/
2020 /* LENNARD-JONES DISPERSION/REPULSION */
2022 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2023 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
2027 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2029 /* Calculate temporary vectorial force */
2030 tx
= _mm_mul_pd(fscal
,dx00
);
2031 ty
= _mm_mul_pd(fscal
,dy00
);
2032 tz
= _mm_mul_pd(fscal
,dz00
);
2034 /* Update vectorial force */
2035 fix0
= _mm_add_pd(fix0
,tx
);
2036 fiy0
= _mm_add_pd(fiy0
,ty
);
2037 fiz0
= _mm_add_pd(fiz0
,tz
);
2039 fjx0
= _mm_add_pd(fjx0
,tx
);
2040 fjy0
= _mm_add_pd(fjy0
,ty
);
2041 fjz0
= _mm_add_pd(fjz0
,tz
);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 r11
= _mm_mul_pd(rsq11
,rinv11
);
2049 /* Calculate table index by multiplying r with table scale and truncate to integer */
2050 rt
= _mm_mul_pd(r11
,vftabscale
);
2051 vfitab
= _mm_cvttpd_epi32(rt
);
2052 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2053 vfitab
= _mm_slli_epi32(vfitab
,2);
2055 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2056 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2057 F
= _mm_setzero_pd();
2058 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2059 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2060 H
= _mm_setzero_pd();
2061 GMX_MM_TRANSPOSE2_PD(G
,H
);
2062 Heps
= _mm_mul_pd(vfeps
,H
);
2063 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2064 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2065 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
2069 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2071 /* Calculate temporary vectorial force */
2072 tx
= _mm_mul_pd(fscal
,dx11
);
2073 ty
= _mm_mul_pd(fscal
,dy11
);
2074 tz
= _mm_mul_pd(fscal
,dz11
);
2076 /* Update vectorial force */
2077 fix1
= _mm_add_pd(fix1
,tx
);
2078 fiy1
= _mm_add_pd(fiy1
,ty
);
2079 fiz1
= _mm_add_pd(fiz1
,tz
);
2081 fjx1
= _mm_add_pd(fjx1
,tx
);
2082 fjy1
= _mm_add_pd(fjy1
,ty
);
2083 fjz1
= _mm_add_pd(fjz1
,tz
);
2085 /**************************
2086 * CALCULATE INTERACTIONS *
2087 **************************/
2089 r12
= _mm_mul_pd(rsq12
,rinv12
);
2091 /* Calculate table index by multiplying r with table scale and truncate to integer */
2092 rt
= _mm_mul_pd(r12
,vftabscale
);
2093 vfitab
= _mm_cvttpd_epi32(rt
);
2094 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2095 vfitab
= _mm_slli_epi32(vfitab
,2);
2097 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2098 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2099 F
= _mm_setzero_pd();
2100 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2101 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2102 H
= _mm_setzero_pd();
2103 GMX_MM_TRANSPOSE2_PD(G
,H
);
2104 Heps
= _mm_mul_pd(vfeps
,H
);
2105 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2106 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2107 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
2111 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2113 /* Calculate temporary vectorial force */
2114 tx
= _mm_mul_pd(fscal
,dx12
);
2115 ty
= _mm_mul_pd(fscal
,dy12
);
2116 tz
= _mm_mul_pd(fscal
,dz12
);
2118 /* Update vectorial force */
2119 fix1
= _mm_add_pd(fix1
,tx
);
2120 fiy1
= _mm_add_pd(fiy1
,ty
);
2121 fiz1
= _mm_add_pd(fiz1
,tz
);
2123 fjx2
= _mm_add_pd(fjx2
,tx
);
2124 fjy2
= _mm_add_pd(fjy2
,ty
);
2125 fjz2
= _mm_add_pd(fjz2
,tz
);
2127 /**************************
2128 * CALCULATE INTERACTIONS *
2129 **************************/
2131 r13
= _mm_mul_pd(rsq13
,rinv13
);
2133 /* Calculate table index by multiplying r with table scale and truncate to integer */
2134 rt
= _mm_mul_pd(r13
,vftabscale
);
2135 vfitab
= _mm_cvttpd_epi32(rt
);
2136 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2137 vfitab
= _mm_slli_epi32(vfitab
,2);
2139 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2140 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2141 F
= _mm_setzero_pd();
2142 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2143 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2144 H
= _mm_setzero_pd();
2145 GMX_MM_TRANSPOSE2_PD(G
,H
);
2146 Heps
= _mm_mul_pd(vfeps
,H
);
2147 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2148 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2149 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq13
,FF
),_mm_mul_pd(vftabscale
,rinv13
)));
2153 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2155 /* Calculate temporary vectorial force */
2156 tx
= _mm_mul_pd(fscal
,dx13
);
2157 ty
= _mm_mul_pd(fscal
,dy13
);
2158 tz
= _mm_mul_pd(fscal
,dz13
);
2160 /* Update vectorial force */
2161 fix1
= _mm_add_pd(fix1
,tx
);
2162 fiy1
= _mm_add_pd(fiy1
,ty
);
2163 fiz1
= _mm_add_pd(fiz1
,tz
);
2165 fjx3
= _mm_add_pd(fjx3
,tx
);
2166 fjy3
= _mm_add_pd(fjy3
,ty
);
2167 fjz3
= _mm_add_pd(fjz3
,tz
);
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 r21
= _mm_mul_pd(rsq21
,rinv21
);
2175 /* Calculate table index by multiplying r with table scale and truncate to integer */
2176 rt
= _mm_mul_pd(r21
,vftabscale
);
2177 vfitab
= _mm_cvttpd_epi32(rt
);
2178 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2179 vfitab
= _mm_slli_epi32(vfitab
,2);
2181 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2182 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2183 F
= _mm_setzero_pd();
2184 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2185 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2186 H
= _mm_setzero_pd();
2187 GMX_MM_TRANSPOSE2_PD(G
,H
);
2188 Heps
= _mm_mul_pd(vfeps
,H
);
2189 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2190 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2191 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
2195 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2197 /* Calculate temporary vectorial force */
2198 tx
= _mm_mul_pd(fscal
,dx21
);
2199 ty
= _mm_mul_pd(fscal
,dy21
);
2200 tz
= _mm_mul_pd(fscal
,dz21
);
2202 /* Update vectorial force */
2203 fix2
= _mm_add_pd(fix2
,tx
);
2204 fiy2
= _mm_add_pd(fiy2
,ty
);
2205 fiz2
= _mm_add_pd(fiz2
,tz
);
2207 fjx1
= _mm_add_pd(fjx1
,tx
);
2208 fjy1
= _mm_add_pd(fjy1
,ty
);
2209 fjz1
= _mm_add_pd(fjz1
,tz
);
2211 /**************************
2212 * CALCULATE INTERACTIONS *
2213 **************************/
2215 r22
= _mm_mul_pd(rsq22
,rinv22
);
2217 /* Calculate table index by multiplying r with table scale and truncate to integer */
2218 rt
= _mm_mul_pd(r22
,vftabscale
);
2219 vfitab
= _mm_cvttpd_epi32(rt
);
2220 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2221 vfitab
= _mm_slli_epi32(vfitab
,2);
2223 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2224 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2225 F
= _mm_setzero_pd();
2226 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2227 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2228 H
= _mm_setzero_pd();
2229 GMX_MM_TRANSPOSE2_PD(G
,H
);
2230 Heps
= _mm_mul_pd(vfeps
,H
);
2231 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2232 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2233 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
2237 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2239 /* Calculate temporary vectorial force */
2240 tx
= _mm_mul_pd(fscal
,dx22
);
2241 ty
= _mm_mul_pd(fscal
,dy22
);
2242 tz
= _mm_mul_pd(fscal
,dz22
);
2244 /* Update vectorial force */
2245 fix2
= _mm_add_pd(fix2
,tx
);
2246 fiy2
= _mm_add_pd(fiy2
,ty
);
2247 fiz2
= _mm_add_pd(fiz2
,tz
);
2249 fjx2
= _mm_add_pd(fjx2
,tx
);
2250 fjy2
= _mm_add_pd(fjy2
,ty
);
2251 fjz2
= _mm_add_pd(fjz2
,tz
);
2253 /**************************
2254 * CALCULATE INTERACTIONS *
2255 **************************/
2257 r23
= _mm_mul_pd(rsq23
,rinv23
);
2259 /* Calculate table index by multiplying r with table scale and truncate to integer */
2260 rt
= _mm_mul_pd(r23
,vftabscale
);
2261 vfitab
= _mm_cvttpd_epi32(rt
);
2262 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2263 vfitab
= _mm_slli_epi32(vfitab
,2);
2265 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2266 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2267 F
= _mm_setzero_pd();
2268 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2269 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2270 H
= _mm_setzero_pd();
2271 GMX_MM_TRANSPOSE2_PD(G
,H
);
2272 Heps
= _mm_mul_pd(vfeps
,H
);
2273 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2274 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2275 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq23
,FF
),_mm_mul_pd(vftabscale
,rinv23
)));
2279 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2281 /* Calculate temporary vectorial force */
2282 tx
= _mm_mul_pd(fscal
,dx23
);
2283 ty
= _mm_mul_pd(fscal
,dy23
);
2284 tz
= _mm_mul_pd(fscal
,dz23
);
2286 /* Update vectorial force */
2287 fix2
= _mm_add_pd(fix2
,tx
);
2288 fiy2
= _mm_add_pd(fiy2
,ty
);
2289 fiz2
= _mm_add_pd(fiz2
,tz
);
2291 fjx3
= _mm_add_pd(fjx3
,tx
);
2292 fjy3
= _mm_add_pd(fjy3
,ty
);
2293 fjz3
= _mm_add_pd(fjz3
,tz
);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 r31
= _mm_mul_pd(rsq31
,rinv31
);
2301 /* Calculate table index by multiplying r with table scale and truncate to integer */
2302 rt
= _mm_mul_pd(r31
,vftabscale
);
2303 vfitab
= _mm_cvttpd_epi32(rt
);
2304 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2305 vfitab
= _mm_slli_epi32(vfitab
,2);
2307 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2308 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2309 F
= _mm_setzero_pd();
2310 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2311 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2312 H
= _mm_setzero_pd();
2313 GMX_MM_TRANSPOSE2_PD(G
,H
);
2314 Heps
= _mm_mul_pd(vfeps
,H
);
2315 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2316 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2317 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq31
,FF
),_mm_mul_pd(vftabscale
,rinv31
)));
2321 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2323 /* Calculate temporary vectorial force */
2324 tx
= _mm_mul_pd(fscal
,dx31
);
2325 ty
= _mm_mul_pd(fscal
,dy31
);
2326 tz
= _mm_mul_pd(fscal
,dz31
);
2328 /* Update vectorial force */
2329 fix3
= _mm_add_pd(fix3
,tx
);
2330 fiy3
= _mm_add_pd(fiy3
,ty
);
2331 fiz3
= _mm_add_pd(fiz3
,tz
);
2333 fjx1
= _mm_add_pd(fjx1
,tx
);
2334 fjy1
= _mm_add_pd(fjy1
,ty
);
2335 fjz1
= _mm_add_pd(fjz1
,tz
);
2337 /**************************
2338 * CALCULATE INTERACTIONS *
2339 **************************/
2341 r32
= _mm_mul_pd(rsq32
,rinv32
);
2343 /* Calculate table index by multiplying r with table scale and truncate to integer */
2344 rt
= _mm_mul_pd(r32
,vftabscale
);
2345 vfitab
= _mm_cvttpd_epi32(rt
);
2346 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2347 vfitab
= _mm_slli_epi32(vfitab
,2);
2349 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2350 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2351 F
= _mm_setzero_pd();
2352 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2353 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2354 H
= _mm_setzero_pd();
2355 GMX_MM_TRANSPOSE2_PD(G
,H
);
2356 Heps
= _mm_mul_pd(vfeps
,H
);
2357 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2358 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2359 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq32
,FF
),_mm_mul_pd(vftabscale
,rinv32
)));
2363 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2365 /* Calculate temporary vectorial force */
2366 tx
= _mm_mul_pd(fscal
,dx32
);
2367 ty
= _mm_mul_pd(fscal
,dy32
);
2368 tz
= _mm_mul_pd(fscal
,dz32
);
2370 /* Update vectorial force */
2371 fix3
= _mm_add_pd(fix3
,tx
);
2372 fiy3
= _mm_add_pd(fiy3
,ty
);
2373 fiz3
= _mm_add_pd(fiz3
,tz
);
2375 fjx2
= _mm_add_pd(fjx2
,tx
);
2376 fjy2
= _mm_add_pd(fjy2
,ty
);
2377 fjz2
= _mm_add_pd(fjz2
,tz
);
2379 /**************************
2380 * CALCULATE INTERACTIONS *
2381 **************************/
2383 r33
= _mm_mul_pd(rsq33
,rinv33
);
2385 /* Calculate table index by multiplying r with table scale and truncate to integer */
2386 rt
= _mm_mul_pd(r33
,vftabscale
);
2387 vfitab
= _mm_cvttpd_epi32(rt
);
2388 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2389 vfitab
= _mm_slli_epi32(vfitab
,2);
2391 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2392 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2393 F
= _mm_setzero_pd();
2394 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2395 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2396 H
= _mm_setzero_pd();
2397 GMX_MM_TRANSPOSE2_PD(G
,H
);
2398 Heps
= _mm_mul_pd(vfeps
,H
);
2399 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2400 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2401 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq33
,FF
),_mm_mul_pd(vftabscale
,rinv33
)));
2405 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2407 /* Calculate temporary vectorial force */
2408 tx
= _mm_mul_pd(fscal
,dx33
);
2409 ty
= _mm_mul_pd(fscal
,dy33
);
2410 tz
= _mm_mul_pd(fscal
,dz33
);
2412 /* Update vectorial force */
2413 fix3
= _mm_add_pd(fix3
,tx
);
2414 fiy3
= _mm_add_pd(fiy3
,ty
);
2415 fiz3
= _mm_add_pd(fiz3
,tz
);
2417 fjx3
= _mm_add_pd(fjx3
,tx
);
2418 fjy3
= _mm_add_pd(fjy3
,ty
);
2419 fjz3
= _mm_add_pd(fjz3
,tz
);
2421 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2423 /* Inner loop uses 381 flops */
2426 /* End of innermost loop */
2428 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2429 f
+i_coord_offset
,fshift
+i_shift_offset
);
2431 /* Increment number of inner iterations */
2432 inneriter
+= j_index_end
- j_index_start
;
2434 /* Outer loop uses 24 flops */
2437 /* Increment number of outer iterations */
2440 /* Update outer/inner flops */
2442 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*381);