2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the two positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int j_coord_offsetA
,j_coord_offsetB
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
81 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 int vdwjidx0A
,vdwjidx0B
;
87 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
88 int vdwjidx1A
,vdwjidx1B
;
89 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
90 int vdwjidx2A
,vdwjidx2B
;
91 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
92 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
93 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
94 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
95 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
96 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
97 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
98 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
99 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
100 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
101 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
104 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
107 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
110 __m128i ifour
= _mm_set1_epi32(4);
111 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
114 __m128d ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
116 __m128d dummy_mask
,cutoff_mask
;
117 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
118 __m128d one
= _mm_set1_pd(1.0);
119 __m128d two
= _mm_set1_pd(2.0);
125 jindex
= nlist
->jindex
;
127 shiftidx
= nlist
->shift
;
129 shiftvec
= fr
->shift_vec
[0];
130 fshift
= fr
->fshift
[0];
131 facel
= _mm_set1_pd(fr
->epsfac
);
132 charge
= mdatoms
->chargeA
;
133 nvdwtype
= fr
->ntype
;
135 vdwtype
= mdatoms
->typeA
;
137 vftab
= kernel_data
->table_vdw
->data
;
138 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
140 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
141 ewtab
= fr
->ic
->tabq_coul_FDV0
;
142 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
143 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
145 /* Setup water-specific parameters */
146 inr
= nlist
->iinr
[0];
147 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
148 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
149 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
150 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
152 jq0
= _mm_set1_pd(charge
[inr
+0]);
153 jq1
= _mm_set1_pd(charge
[inr
+1]);
154 jq2
= _mm_set1_pd(charge
[inr
+2]);
155 vdwjidx0A
= 2*vdwtype
[inr
+0];
156 qq00
= _mm_mul_pd(iq0
,jq0
);
157 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
158 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
159 qq01
= _mm_mul_pd(iq0
,jq1
);
160 qq02
= _mm_mul_pd(iq0
,jq2
);
161 qq10
= _mm_mul_pd(iq1
,jq0
);
162 qq11
= _mm_mul_pd(iq1
,jq1
);
163 qq12
= _mm_mul_pd(iq1
,jq2
);
164 qq20
= _mm_mul_pd(iq2
,jq0
);
165 qq21
= _mm_mul_pd(iq2
,jq1
);
166 qq22
= _mm_mul_pd(iq2
,jq2
);
168 /* Avoid stupid compiler warnings */
176 /* Start outer loop over neighborlists */
177 for(iidx
=0; iidx
<nri
; iidx
++)
179 /* Load shift vector for this list */
180 i_shift_offset
= DIM
*shiftidx
[iidx
];
182 /* Load limits for loop over neighbors */
183 j_index_start
= jindex
[iidx
];
184 j_index_end
= jindex
[iidx
+1];
186 /* Get outer coordinate index */
188 i_coord_offset
= DIM
*inr
;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
192 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
194 fix0
= _mm_setzero_pd();
195 fiy0
= _mm_setzero_pd();
196 fiz0
= _mm_setzero_pd();
197 fix1
= _mm_setzero_pd();
198 fiy1
= _mm_setzero_pd();
199 fiz1
= _mm_setzero_pd();
200 fix2
= _mm_setzero_pd();
201 fiy2
= _mm_setzero_pd();
202 fiz2
= _mm_setzero_pd();
204 /* Reset potential sums */
205 velecsum
= _mm_setzero_pd();
206 vvdwsum
= _mm_setzero_pd();
208 /* Start inner kernel loop */
209 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
212 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA
= DIM
*jnrA
;
216 j_coord_offsetB
= DIM
*jnrB
;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
220 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
222 /* Calculate displacement vector */
223 dx00
= _mm_sub_pd(ix0
,jx0
);
224 dy00
= _mm_sub_pd(iy0
,jy0
);
225 dz00
= _mm_sub_pd(iz0
,jz0
);
226 dx01
= _mm_sub_pd(ix0
,jx1
);
227 dy01
= _mm_sub_pd(iy0
,jy1
);
228 dz01
= _mm_sub_pd(iz0
,jz1
);
229 dx02
= _mm_sub_pd(ix0
,jx2
);
230 dy02
= _mm_sub_pd(iy0
,jy2
);
231 dz02
= _mm_sub_pd(iz0
,jz2
);
232 dx10
= _mm_sub_pd(ix1
,jx0
);
233 dy10
= _mm_sub_pd(iy1
,jy0
);
234 dz10
= _mm_sub_pd(iz1
,jz0
);
235 dx11
= _mm_sub_pd(ix1
,jx1
);
236 dy11
= _mm_sub_pd(iy1
,jy1
);
237 dz11
= _mm_sub_pd(iz1
,jz1
);
238 dx12
= _mm_sub_pd(ix1
,jx2
);
239 dy12
= _mm_sub_pd(iy1
,jy2
);
240 dz12
= _mm_sub_pd(iz1
,jz2
);
241 dx20
= _mm_sub_pd(ix2
,jx0
);
242 dy20
= _mm_sub_pd(iy2
,jy0
);
243 dz20
= _mm_sub_pd(iz2
,jz0
);
244 dx21
= _mm_sub_pd(ix2
,jx1
);
245 dy21
= _mm_sub_pd(iy2
,jy1
);
246 dz21
= _mm_sub_pd(iz2
,jz1
);
247 dx22
= _mm_sub_pd(ix2
,jx2
);
248 dy22
= _mm_sub_pd(iy2
,jy2
);
249 dz22
= _mm_sub_pd(iz2
,jz2
);
251 /* Calculate squared distance and things based on it */
252 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
253 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
254 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
255 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
256 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
257 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
258 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
259 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
260 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
262 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
263 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
264 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
265 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
266 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
267 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
268 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
269 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
270 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
272 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
273 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
274 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
275 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
276 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
277 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
278 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
279 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
280 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
282 fjx0
= _mm_setzero_pd();
283 fjy0
= _mm_setzero_pd();
284 fjz0
= _mm_setzero_pd();
285 fjx1
= _mm_setzero_pd();
286 fjy1
= _mm_setzero_pd();
287 fjz1
= _mm_setzero_pd();
288 fjx2
= _mm_setzero_pd();
289 fjy2
= _mm_setzero_pd();
290 fjz2
= _mm_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 r00
= _mm_mul_pd(rsq00
,rinv00
);
298 /* Calculate table index by multiplying r with table scale and truncate to integer */
299 rt
= _mm_mul_pd(r00
,vftabscale
);
300 vfitab
= _mm_cvttpd_epi32(rt
);
302 vfeps
= _mm_frcz_pd(rt
);
304 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
306 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
307 vfitab
= _mm_slli_epi32(vfitab
,3);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
313 ewitab
= _mm_cvttpd_epi32(ewrt
);
315 eweps
= _mm_frcz_pd(ewrt
);
317 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
319 twoeweps
= _mm_add_pd(eweps
,eweps
);
320 ewitab
= _mm_slli_epi32(ewitab
,2);
321 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
322 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
323 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
324 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
325 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
326 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
327 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
328 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
329 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
330 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
332 /* CUBIC SPLINE TABLE DISPERSION */
333 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
334 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
335 GMX_MM_TRANSPOSE2_PD(Y
,F
);
336 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
337 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
338 GMX_MM_TRANSPOSE2_PD(G
,H
);
339 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
340 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
341 vvdw6
= _mm_mul_pd(c6_00
,VV
);
342 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
343 fvdw6
= _mm_mul_pd(c6_00
,FF
);
345 /* CUBIC SPLINE TABLE REPULSION */
346 vfitab
= _mm_add_epi32(vfitab
,ifour
);
347 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
348 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
349 GMX_MM_TRANSPOSE2_PD(Y
,F
);
350 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
351 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
352 GMX_MM_TRANSPOSE2_PD(G
,H
);
353 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
354 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
355 vvdw12
= _mm_mul_pd(c12_00
,VV
);
356 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
357 fvdw12
= _mm_mul_pd(c12_00
,FF
);
358 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
359 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velecsum
= _mm_add_pd(velecsum
,velec
);
363 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
365 fscal
= _mm_add_pd(felec
,fvdw
);
367 /* Update vectorial force */
368 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
369 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
370 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
372 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
373 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
374 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r01
= _mm_mul_pd(rsq01
,rinv01
);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
386 ewitab
= _mm_cvttpd_epi32(ewrt
);
388 eweps
= _mm_frcz_pd(ewrt
);
390 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
392 twoeweps
= _mm_add_pd(eweps
,eweps
);
393 ewitab
= _mm_slli_epi32(ewitab
,2);
394 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
395 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
396 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
397 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
398 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
399 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
400 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
401 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
402 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
403 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum
= _mm_add_pd(velecsum
,velec
);
410 /* Update vectorial force */
411 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
412 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
413 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
415 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
416 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
417 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 r02
= _mm_mul_pd(rsq02
,rinv02
);
425 /* EWALD ELECTROSTATICS */
427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
429 ewitab
= _mm_cvttpd_epi32(ewrt
);
431 eweps
= _mm_frcz_pd(ewrt
);
433 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
435 twoeweps
= _mm_add_pd(eweps
,eweps
);
436 ewitab
= _mm_slli_epi32(ewitab
,2);
437 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
438 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
439 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
440 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
441 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
442 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
443 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
444 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
445 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
446 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum
= _mm_add_pd(velecsum
,velec
);
453 /* Update vectorial force */
454 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
455 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
456 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
458 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
459 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
460 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r10
= _mm_mul_pd(rsq10
,rinv10
);
468 /* EWALD ELECTROSTATICS */
470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
472 ewitab
= _mm_cvttpd_epi32(ewrt
);
474 eweps
= _mm_frcz_pd(ewrt
);
476 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
478 twoeweps
= _mm_add_pd(eweps
,eweps
);
479 ewitab
= _mm_slli_epi32(ewitab
,2);
480 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
481 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
482 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
483 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
484 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
485 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
486 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
487 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
488 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
489 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velecsum
= _mm_add_pd(velecsum
,velec
);
496 /* Update vectorial force */
497 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
498 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
499 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
501 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
502 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
503 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r11
= _mm_mul_pd(rsq11
,rinv11
);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
515 ewitab
= _mm_cvttpd_epi32(ewrt
);
517 eweps
= _mm_frcz_pd(ewrt
);
519 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
521 twoeweps
= _mm_add_pd(eweps
,eweps
);
522 ewitab
= _mm_slli_epi32(ewitab
,2);
523 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
524 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
525 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
526 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
527 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
528 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
529 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
530 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
531 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
532 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum
= _mm_add_pd(velecsum
,velec
);
539 /* Update vectorial force */
540 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
541 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
542 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
544 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
545 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
546 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 r12
= _mm_mul_pd(rsq12
,rinv12
);
554 /* EWALD ELECTROSTATICS */
556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
558 ewitab
= _mm_cvttpd_epi32(ewrt
);
560 eweps
= _mm_frcz_pd(ewrt
);
562 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
564 twoeweps
= _mm_add_pd(eweps
,eweps
);
565 ewitab
= _mm_slli_epi32(ewitab
,2);
566 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
567 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
568 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
569 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
570 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
571 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
572 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
573 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
574 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
575 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velecsum
= _mm_add_pd(velecsum
,velec
);
582 /* Update vectorial force */
583 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
584 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
585 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
587 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
588 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
589 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 r20
= _mm_mul_pd(rsq20
,rinv20
);
597 /* EWALD ELECTROSTATICS */
599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
600 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
601 ewitab
= _mm_cvttpd_epi32(ewrt
);
603 eweps
= _mm_frcz_pd(ewrt
);
605 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
607 twoeweps
= _mm_add_pd(eweps
,eweps
);
608 ewitab
= _mm_slli_epi32(ewitab
,2);
609 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
610 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
611 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
612 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
613 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
614 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
615 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
616 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
617 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
618 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum
= _mm_add_pd(velecsum
,velec
);
625 /* Update vectorial force */
626 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
627 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
628 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
630 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
631 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
632 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 r21
= _mm_mul_pd(rsq21
,rinv21
);
640 /* EWALD ELECTROSTATICS */
642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
644 ewitab
= _mm_cvttpd_epi32(ewrt
);
646 eweps
= _mm_frcz_pd(ewrt
);
648 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
650 twoeweps
= _mm_add_pd(eweps
,eweps
);
651 ewitab
= _mm_slli_epi32(ewitab
,2);
652 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
653 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
654 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
655 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
656 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
657 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
658 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
659 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
660 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
661 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum
= _mm_add_pd(velecsum
,velec
);
668 /* Update vectorial force */
669 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
670 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
671 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
673 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
674 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
675 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r22
= _mm_mul_pd(rsq22
,rinv22
);
683 /* EWALD ELECTROSTATICS */
685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
686 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
687 ewitab
= _mm_cvttpd_epi32(ewrt
);
689 eweps
= _mm_frcz_pd(ewrt
);
691 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
693 twoeweps
= _mm_add_pd(eweps
,eweps
);
694 ewitab
= _mm_slli_epi32(ewitab
,2);
695 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
696 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
697 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
698 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
699 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
700 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
701 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
702 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
703 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
704 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
706 /* Update potential sum for this i atom from the interaction with this j atom. */
707 velecsum
= _mm_add_pd(velecsum
,velec
);
711 /* Update vectorial force */
712 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
713 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
714 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
716 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
717 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
718 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
720 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
722 /* Inner loop uses 430 flops */
729 j_coord_offsetA
= DIM
*jnrA
;
731 /* load j atom coordinates */
732 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
733 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
735 /* Calculate displacement vector */
736 dx00
= _mm_sub_pd(ix0
,jx0
);
737 dy00
= _mm_sub_pd(iy0
,jy0
);
738 dz00
= _mm_sub_pd(iz0
,jz0
);
739 dx01
= _mm_sub_pd(ix0
,jx1
);
740 dy01
= _mm_sub_pd(iy0
,jy1
);
741 dz01
= _mm_sub_pd(iz0
,jz1
);
742 dx02
= _mm_sub_pd(ix0
,jx2
);
743 dy02
= _mm_sub_pd(iy0
,jy2
);
744 dz02
= _mm_sub_pd(iz0
,jz2
);
745 dx10
= _mm_sub_pd(ix1
,jx0
);
746 dy10
= _mm_sub_pd(iy1
,jy0
);
747 dz10
= _mm_sub_pd(iz1
,jz0
);
748 dx11
= _mm_sub_pd(ix1
,jx1
);
749 dy11
= _mm_sub_pd(iy1
,jy1
);
750 dz11
= _mm_sub_pd(iz1
,jz1
);
751 dx12
= _mm_sub_pd(ix1
,jx2
);
752 dy12
= _mm_sub_pd(iy1
,jy2
);
753 dz12
= _mm_sub_pd(iz1
,jz2
);
754 dx20
= _mm_sub_pd(ix2
,jx0
);
755 dy20
= _mm_sub_pd(iy2
,jy0
);
756 dz20
= _mm_sub_pd(iz2
,jz0
);
757 dx21
= _mm_sub_pd(ix2
,jx1
);
758 dy21
= _mm_sub_pd(iy2
,jy1
);
759 dz21
= _mm_sub_pd(iz2
,jz1
);
760 dx22
= _mm_sub_pd(ix2
,jx2
);
761 dy22
= _mm_sub_pd(iy2
,jy2
);
762 dz22
= _mm_sub_pd(iz2
,jz2
);
764 /* Calculate squared distance and things based on it */
765 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
766 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
767 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
768 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
769 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
770 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
771 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
772 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
773 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
775 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
776 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
777 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
778 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
779 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
780 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
781 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
782 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
783 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
785 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
786 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
787 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
788 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
789 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
790 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
791 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
792 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
793 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
795 fjx0
= _mm_setzero_pd();
796 fjy0
= _mm_setzero_pd();
797 fjz0
= _mm_setzero_pd();
798 fjx1
= _mm_setzero_pd();
799 fjy1
= _mm_setzero_pd();
800 fjz1
= _mm_setzero_pd();
801 fjx2
= _mm_setzero_pd();
802 fjy2
= _mm_setzero_pd();
803 fjz2
= _mm_setzero_pd();
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 r00
= _mm_mul_pd(rsq00
,rinv00
);
811 /* Calculate table index by multiplying r with table scale and truncate to integer */
812 rt
= _mm_mul_pd(r00
,vftabscale
);
813 vfitab
= _mm_cvttpd_epi32(rt
);
815 vfeps
= _mm_frcz_pd(rt
);
817 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
819 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
820 vfitab
= _mm_slli_epi32(vfitab
,3);
822 /* EWALD ELECTROSTATICS */
824 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
825 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
826 ewitab
= _mm_cvttpd_epi32(ewrt
);
828 eweps
= _mm_frcz_pd(ewrt
);
830 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
832 twoeweps
= _mm_add_pd(eweps
,eweps
);
833 ewitab
= _mm_slli_epi32(ewitab
,2);
834 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
835 ewtabD
= _mm_setzero_pd();
836 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
837 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
838 ewtabFn
= _mm_setzero_pd();
839 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
840 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
841 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
842 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
843 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
845 /* CUBIC SPLINE TABLE DISPERSION */
846 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
847 F
= _mm_setzero_pd();
848 GMX_MM_TRANSPOSE2_PD(Y
,F
);
849 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
850 H
= _mm_setzero_pd();
851 GMX_MM_TRANSPOSE2_PD(G
,H
);
852 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
853 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
854 vvdw6
= _mm_mul_pd(c6_00
,VV
);
855 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
856 fvdw6
= _mm_mul_pd(c6_00
,FF
);
858 /* CUBIC SPLINE TABLE REPULSION */
859 vfitab
= _mm_add_epi32(vfitab
,ifour
);
860 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
861 F
= _mm_setzero_pd();
862 GMX_MM_TRANSPOSE2_PD(Y
,F
);
863 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
864 H
= _mm_setzero_pd();
865 GMX_MM_TRANSPOSE2_PD(G
,H
);
866 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
867 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
868 vvdw12
= _mm_mul_pd(c12_00
,VV
);
869 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
870 fvdw12
= _mm_mul_pd(c12_00
,FF
);
871 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
872 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
876 velecsum
= _mm_add_pd(velecsum
,velec
);
877 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
878 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
880 fscal
= _mm_add_pd(felec
,fvdw
);
882 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
884 /* Update vectorial force */
885 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
886 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
887 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
889 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
890 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
891 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 r01
= _mm_mul_pd(rsq01
,rinv01
);
899 /* EWALD ELECTROSTATICS */
901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
902 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
903 ewitab
= _mm_cvttpd_epi32(ewrt
);
905 eweps
= _mm_frcz_pd(ewrt
);
907 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
909 twoeweps
= _mm_add_pd(eweps
,eweps
);
910 ewitab
= _mm_slli_epi32(ewitab
,2);
911 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
912 ewtabD
= _mm_setzero_pd();
913 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
914 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
915 ewtabFn
= _mm_setzero_pd();
916 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
917 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
918 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
919 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
920 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
924 velecsum
= _mm_add_pd(velecsum
,velec
);
928 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
930 /* Update vectorial force */
931 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
932 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
933 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
935 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
936 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
937 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 r02
= _mm_mul_pd(rsq02
,rinv02
);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
949 ewitab
= _mm_cvttpd_epi32(ewrt
);
951 eweps
= _mm_frcz_pd(ewrt
);
953 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
955 twoeweps
= _mm_add_pd(eweps
,eweps
);
956 ewitab
= _mm_slli_epi32(ewitab
,2);
957 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
958 ewtabD
= _mm_setzero_pd();
959 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
960 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
961 ewtabFn
= _mm_setzero_pd();
962 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
963 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
964 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
965 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
966 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
970 velecsum
= _mm_add_pd(velecsum
,velec
);
974 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
976 /* Update vectorial force */
977 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
978 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
979 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
981 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
982 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
983 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
985 /**************************
986 * CALCULATE INTERACTIONS *
987 **************************/
989 r10
= _mm_mul_pd(rsq10
,rinv10
);
991 /* EWALD ELECTROSTATICS */
993 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
994 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
995 ewitab
= _mm_cvttpd_epi32(ewrt
);
997 eweps
= _mm_frcz_pd(ewrt
);
999 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1001 twoeweps
= _mm_add_pd(eweps
,eweps
);
1002 ewitab
= _mm_slli_epi32(ewitab
,2);
1003 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1004 ewtabD
= _mm_setzero_pd();
1005 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1006 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1007 ewtabFn
= _mm_setzero_pd();
1008 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1009 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1010 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1011 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1012 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1014 /* Update potential sum for this i atom from the interaction with this j atom. */
1015 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1016 velecsum
= _mm_add_pd(velecsum
,velec
);
1020 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1022 /* Update vectorial force */
1023 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
1024 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
1025 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
1027 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
1028 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
1029 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 r11
= _mm_mul_pd(rsq11
,rinv11
);
1037 /* EWALD ELECTROSTATICS */
1039 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1040 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1041 ewitab
= _mm_cvttpd_epi32(ewrt
);
1043 eweps
= _mm_frcz_pd(ewrt
);
1045 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1047 twoeweps
= _mm_add_pd(eweps
,eweps
);
1048 ewitab
= _mm_slli_epi32(ewitab
,2);
1049 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1050 ewtabD
= _mm_setzero_pd();
1051 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1052 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1053 ewtabFn
= _mm_setzero_pd();
1054 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1055 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1056 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1057 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
1058 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1060 /* Update potential sum for this i atom from the interaction with this j atom. */
1061 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1062 velecsum
= _mm_add_pd(velecsum
,velec
);
1066 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1068 /* Update vectorial force */
1069 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
1070 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
1071 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
1073 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
1074 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
1075 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1081 r12
= _mm_mul_pd(rsq12
,rinv12
);
1083 /* EWALD ELECTROSTATICS */
1085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1086 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1087 ewitab
= _mm_cvttpd_epi32(ewrt
);
1089 eweps
= _mm_frcz_pd(ewrt
);
1091 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1093 twoeweps
= _mm_add_pd(eweps
,eweps
);
1094 ewitab
= _mm_slli_epi32(ewitab
,2);
1095 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1096 ewtabD
= _mm_setzero_pd();
1097 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1098 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1099 ewtabFn
= _mm_setzero_pd();
1100 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1101 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1102 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1103 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
1104 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1108 velecsum
= _mm_add_pd(velecsum
,velec
);
1112 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1114 /* Update vectorial force */
1115 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
1116 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
1117 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
1119 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
1120 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
1121 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r20
= _mm_mul_pd(rsq20
,rinv20
);
1129 /* EWALD ELECTROSTATICS */
1131 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1132 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1133 ewitab
= _mm_cvttpd_epi32(ewrt
);
1135 eweps
= _mm_frcz_pd(ewrt
);
1137 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1139 twoeweps
= _mm_add_pd(eweps
,eweps
);
1140 ewitab
= _mm_slli_epi32(ewitab
,2);
1141 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1142 ewtabD
= _mm_setzero_pd();
1143 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1144 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1145 ewtabFn
= _mm_setzero_pd();
1146 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1147 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1148 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1149 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1150 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1154 velecsum
= _mm_add_pd(velecsum
,velec
);
1158 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1160 /* Update vectorial force */
1161 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
1162 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
1163 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
1165 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
1166 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
1167 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 r21
= _mm_mul_pd(rsq21
,rinv21
);
1175 /* EWALD ELECTROSTATICS */
1177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1179 ewitab
= _mm_cvttpd_epi32(ewrt
);
1181 eweps
= _mm_frcz_pd(ewrt
);
1183 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1185 twoeweps
= _mm_add_pd(eweps
,eweps
);
1186 ewitab
= _mm_slli_epi32(ewitab
,2);
1187 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1188 ewtabD
= _mm_setzero_pd();
1189 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1190 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1191 ewtabFn
= _mm_setzero_pd();
1192 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1193 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1194 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1195 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
1196 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1200 velecsum
= _mm_add_pd(velecsum
,velec
);
1204 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1206 /* Update vectorial force */
1207 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
1208 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
1209 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
1211 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
1212 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
1213 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 r22
= _mm_mul_pd(rsq22
,rinv22
);
1221 /* EWALD ELECTROSTATICS */
1223 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1224 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1225 ewitab
= _mm_cvttpd_epi32(ewrt
);
1227 eweps
= _mm_frcz_pd(ewrt
);
1229 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1231 twoeweps
= _mm_add_pd(eweps
,eweps
);
1232 ewitab
= _mm_slli_epi32(ewitab
,2);
1233 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1234 ewtabD
= _mm_setzero_pd();
1235 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1236 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
1237 ewtabFn
= _mm_setzero_pd();
1238 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1239 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
1240 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
1241 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1242 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1246 velecsum
= _mm_add_pd(velecsum
,velec
);
1250 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1252 /* Update vectorial force */
1253 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
1254 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
1255 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
1257 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
1258 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
1259 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
1261 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1263 /* Inner loop uses 430 flops */
1266 /* End of innermost loop */
1268 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1269 f
+i_coord_offset
,fshift
+i_shift_offset
);
1272 /* Update potential energies */
1273 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1274 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1276 /* Increment number of inner iterations */
1277 inneriter
+= j_index_end
- j_index_start
;
1279 /* Outer loop uses 20 flops */
1282 /* Increment number of outer iterations */
1285 /* Update outer/inner flops */
1287 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*430);
1290 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1291 * Electrostatics interaction: Ewald
1292 * VdW interaction: CubicSplineTable
1293 * Geometry: Water3-Water3
1294 * Calculate force/pot: Force
1297 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1298 (t_nblist
* gmx_restrict nlist
,
1299 rvec
* gmx_restrict xx
,
1300 rvec
* gmx_restrict ff
,
1301 t_forcerec
* gmx_restrict fr
,
1302 t_mdatoms
* gmx_restrict mdatoms
,
1303 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1304 t_nrnb
* gmx_restrict nrnb
)
1306 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1307 * just 0 for non-waters.
1308 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1309 * jnr indices corresponding to data put in the two positions in the SIMD register.
1311 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1312 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1314 int j_coord_offsetA
,j_coord_offsetB
;
1315 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1316 real rcutoff_scalar
;
1317 real
*shiftvec
,*fshift
,*x
,*f
;
1318 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1320 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1322 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1324 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1325 int vdwjidx0A
,vdwjidx0B
;
1326 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1327 int vdwjidx1A
,vdwjidx1B
;
1328 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1329 int vdwjidx2A
,vdwjidx2B
;
1330 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1331 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1332 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1333 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1334 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1335 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1336 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1337 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1338 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1339 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1340 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1343 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1346 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1347 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1349 __m128i ifour
= _mm_set1_epi32(4);
1350 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
1353 __m128d ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1355 __m128d dummy_mask
,cutoff_mask
;
1356 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1357 __m128d one
= _mm_set1_pd(1.0);
1358 __m128d two
= _mm_set1_pd(2.0);
1364 jindex
= nlist
->jindex
;
1366 shiftidx
= nlist
->shift
;
1368 shiftvec
= fr
->shift_vec
[0];
1369 fshift
= fr
->fshift
[0];
1370 facel
= _mm_set1_pd(fr
->epsfac
);
1371 charge
= mdatoms
->chargeA
;
1372 nvdwtype
= fr
->ntype
;
1373 vdwparam
= fr
->nbfp
;
1374 vdwtype
= mdatoms
->typeA
;
1376 vftab
= kernel_data
->table_vdw
->data
;
1377 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
1379 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1380 ewtab
= fr
->ic
->tabq_coul_F
;
1381 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1382 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1384 /* Setup water-specific parameters */
1385 inr
= nlist
->iinr
[0];
1386 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1387 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1388 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1389 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1391 jq0
= _mm_set1_pd(charge
[inr
+0]);
1392 jq1
= _mm_set1_pd(charge
[inr
+1]);
1393 jq2
= _mm_set1_pd(charge
[inr
+2]);
1394 vdwjidx0A
= 2*vdwtype
[inr
+0];
1395 qq00
= _mm_mul_pd(iq0
,jq0
);
1396 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1397 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1398 qq01
= _mm_mul_pd(iq0
,jq1
);
1399 qq02
= _mm_mul_pd(iq0
,jq2
);
1400 qq10
= _mm_mul_pd(iq1
,jq0
);
1401 qq11
= _mm_mul_pd(iq1
,jq1
);
1402 qq12
= _mm_mul_pd(iq1
,jq2
);
1403 qq20
= _mm_mul_pd(iq2
,jq0
);
1404 qq21
= _mm_mul_pd(iq2
,jq1
);
1405 qq22
= _mm_mul_pd(iq2
,jq2
);
1407 /* Zero-initialize the j coordinate offsets to avoid spurious compiler warnings */
1409 j_coord_offsetA
= 0;
1410 j_coord_offsetB
= 0;
1415 /* Start outer loop over neighborlists */
1416 for(iidx
=0; iidx
<nri
; iidx
++)
1418 /* Load shift vector for this list */
1419 i_shift_offset
= DIM
*shiftidx
[iidx
];
1421 /* Load limits for loop over neighbors */
1422 j_index_start
= jindex
[iidx
];
1423 j_index_end
= jindex
[iidx
+1];
1425 /* Get outer coordinate index */
1427 i_coord_offset
= DIM
*inr
;
1429 /* Load i particle coords and add shift vector */
1430 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1431 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1433 fix0
= _mm_setzero_pd();
1434 fiy0
= _mm_setzero_pd();
1435 fiz0
= _mm_setzero_pd();
1436 fix1
= _mm_setzero_pd();
1437 fiy1
= _mm_setzero_pd();
1438 fiz1
= _mm_setzero_pd();
1439 fix2
= _mm_setzero_pd();
1440 fiy2
= _mm_setzero_pd();
1441 fiz2
= _mm_setzero_pd();
1443 /* Start inner kernel loop */
1444 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1447 /* Get j neighbor index, and coordinate index */
1449 jnrB
= jjnr
[jidx
+1];
1450 j_coord_offsetA
= DIM
*jnrA
;
1451 j_coord_offsetB
= DIM
*jnrB
;
1453 /* load j atom coordinates */
1454 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1455 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1457 /* Calculate displacement vector */
1458 dx00
= _mm_sub_pd(ix0
,jx0
);
1459 dy00
= _mm_sub_pd(iy0
,jy0
);
1460 dz00
= _mm_sub_pd(iz0
,jz0
);
1461 dx01
= _mm_sub_pd(ix0
,jx1
);
1462 dy01
= _mm_sub_pd(iy0
,jy1
);
1463 dz01
= _mm_sub_pd(iz0
,jz1
);
1464 dx02
= _mm_sub_pd(ix0
,jx2
);
1465 dy02
= _mm_sub_pd(iy0
,jy2
);
1466 dz02
= _mm_sub_pd(iz0
,jz2
);
1467 dx10
= _mm_sub_pd(ix1
,jx0
);
1468 dy10
= _mm_sub_pd(iy1
,jy0
);
1469 dz10
= _mm_sub_pd(iz1
,jz0
);
1470 dx11
= _mm_sub_pd(ix1
,jx1
);
1471 dy11
= _mm_sub_pd(iy1
,jy1
);
1472 dz11
= _mm_sub_pd(iz1
,jz1
);
1473 dx12
= _mm_sub_pd(ix1
,jx2
);
1474 dy12
= _mm_sub_pd(iy1
,jy2
);
1475 dz12
= _mm_sub_pd(iz1
,jz2
);
1476 dx20
= _mm_sub_pd(ix2
,jx0
);
1477 dy20
= _mm_sub_pd(iy2
,jy0
);
1478 dz20
= _mm_sub_pd(iz2
,jz0
);
1479 dx21
= _mm_sub_pd(ix2
,jx1
);
1480 dy21
= _mm_sub_pd(iy2
,jy1
);
1481 dz21
= _mm_sub_pd(iz2
,jz1
);
1482 dx22
= _mm_sub_pd(ix2
,jx2
);
1483 dy22
= _mm_sub_pd(iy2
,jy2
);
1484 dz22
= _mm_sub_pd(iz2
,jz2
);
1486 /* Calculate squared distance and things based on it */
1487 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1488 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1489 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1490 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1491 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1492 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1493 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1494 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1495 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1497 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1498 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1499 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1500 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1501 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1502 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1503 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1504 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1505 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1507 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1508 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1509 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1510 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1511 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1512 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1513 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1514 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1515 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1517 fjx0
= _mm_setzero_pd();
1518 fjy0
= _mm_setzero_pd();
1519 fjz0
= _mm_setzero_pd();
1520 fjx1
= _mm_setzero_pd();
1521 fjy1
= _mm_setzero_pd();
1522 fjz1
= _mm_setzero_pd();
1523 fjx2
= _mm_setzero_pd();
1524 fjy2
= _mm_setzero_pd();
1525 fjz2
= _mm_setzero_pd();
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 r00
= _mm_mul_pd(rsq00
,rinv00
);
1533 /* Calculate table index by multiplying r with table scale and truncate to integer */
1534 rt
= _mm_mul_pd(r00
,vftabscale
);
1535 vfitab
= _mm_cvttpd_epi32(rt
);
1537 vfeps
= _mm_frcz_pd(rt
);
1539 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1541 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
1542 vfitab
= _mm_slli_epi32(vfitab
,3);
1544 /* EWALD ELECTROSTATICS */
1546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1548 ewitab
= _mm_cvttpd_epi32(ewrt
);
1550 eweps
= _mm_frcz_pd(ewrt
);
1552 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1554 twoeweps
= _mm_add_pd(eweps
,eweps
);
1555 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1557 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1558 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1560 /* CUBIC SPLINE TABLE DISPERSION */
1561 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1562 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1563 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1564 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1565 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
1566 GMX_MM_TRANSPOSE2_PD(G
,H
);
1567 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1568 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1569 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1571 /* CUBIC SPLINE TABLE REPULSION */
1572 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1573 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1574 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1575 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1576 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1577 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
1578 GMX_MM_TRANSPOSE2_PD(G
,H
);
1579 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1580 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1581 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1582 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1584 fscal
= _mm_add_pd(felec
,fvdw
);
1586 /* Update vectorial force */
1587 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
1588 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
1589 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
1591 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
1592 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
1593 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 r01
= _mm_mul_pd(rsq01
,rinv01
);
1601 /* EWALD ELECTROSTATICS */
1603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1604 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
1605 ewitab
= _mm_cvttpd_epi32(ewrt
);
1607 eweps
= _mm_frcz_pd(ewrt
);
1609 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1611 twoeweps
= _mm_add_pd(eweps
,eweps
);
1612 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1614 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1615 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
1619 /* Update vectorial force */
1620 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
1621 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
1622 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
1624 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
1625 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
1626 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
1628 /**************************
1629 * CALCULATE INTERACTIONS *
1630 **************************/
1632 r02
= _mm_mul_pd(rsq02
,rinv02
);
1634 /* EWALD ELECTROSTATICS */
1636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1637 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1638 ewitab
= _mm_cvttpd_epi32(ewrt
);
1640 eweps
= _mm_frcz_pd(ewrt
);
1642 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1644 twoeweps
= _mm_add_pd(eweps
,eweps
);
1645 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1647 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1648 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1652 /* Update vectorial force */
1653 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
1654 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
1655 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
1657 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
1658 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
1659 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 r10
= _mm_mul_pd(rsq10
,rinv10
);
1667 /* EWALD ELECTROSTATICS */
1669 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1670 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1671 ewitab
= _mm_cvttpd_epi32(ewrt
);
1673 eweps
= _mm_frcz_pd(ewrt
);
1675 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1677 twoeweps
= _mm_add_pd(eweps
,eweps
);
1678 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1680 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1681 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1685 /* Update vectorial force */
1686 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
1687 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
1688 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
1690 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
1691 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
1692 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 r11
= _mm_mul_pd(rsq11
,rinv11
);
1700 /* EWALD ELECTROSTATICS */
1702 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1703 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1704 ewitab
= _mm_cvttpd_epi32(ewrt
);
1706 eweps
= _mm_frcz_pd(ewrt
);
1708 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1710 twoeweps
= _mm_add_pd(eweps
,eweps
);
1711 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1713 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1714 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1718 /* Update vectorial force */
1719 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
1720 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
1721 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
1723 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
1724 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
1725 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r12
= _mm_mul_pd(rsq12
,rinv12
);
1733 /* EWALD ELECTROSTATICS */
1735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1737 ewitab
= _mm_cvttpd_epi32(ewrt
);
1739 eweps
= _mm_frcz_pd(ewrt
);
1741 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1743 twoeweps
= _mm_add_pd(eweps
,eweps
);
1744 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1746 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1747 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1751 /* Update vectorial force */
1752 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
1753 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
1754 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
1756 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
1757 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
1758 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
1760 /**************************
1761 * CALCULATE INTERACTIONS *
1762 **************************/
1764 r20
= _mm_mul_pd(rsq20
,rinv20
);
1766 /* EWALD ELECTROSTATICS */
1768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1770 ewitab
= _mm_cvttpd_epi32(ewrt
);
1772 eweps
= _mm_frcz_pd(ewrt
);
1774 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1776 twoeweps
= _mm_add_pd(eweps
,eweps
);
1777 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1779 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1780 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1784 /* Update vectorial force */
1785 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
1786 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
1787 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
1789 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
1790 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
1791 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
1793 /**************************
1794 * CALCULATE INTERACTIONS *
1795 **************************/
1797 r21
= _mm_mul_pd(rsq21
,rinv21
);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1803 ewitab
= _mm_cvttpd_epi32(ewrt
);
1805 eweps
= _mm_frcz_pd(ewrt
);
1807 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1809 twoeweps
= _mm_add_pd(eweps
,eweps
);
1810 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1812 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1813 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1817 /* Update vectorial force */
1818 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
1819 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
1820 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
1822 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
1823 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
1824 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
1826 /**************************
1827 * CALCULATE INTERACTIONS *
1828 **************************/
1830 r22
= _mm_mul_pd(rsq22
,rinv22
);
1832 /* EWALD ELECTROSTATICS */
1834 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1835 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1836 ewitab
= _mm_cvttpd_epi32(ewrt
);
1838 eweps
= _mm_frcz_pd(ewrt
);
1840 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1842 twoeweps
= _mm_add_pd(eweps
,eweps
);
1843 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
1845 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1846 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1850 /* Update vectorial force */
1851 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
1852 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
1853 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
1855 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
1856 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
1857 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
1859 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1861 /* Inner loop uses 377 flops */
1864 if(jidx
<j_index_end
)
1868 j_coord_offsetA
= DIM
*jnrA
;
1870 /* load j atom coordinates */
1871 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1872 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1874 /* Calculate displacement vector */
1875 dx00
= _mm_sub_pd(ix0
,jx0
);
1876 dy00
= _mm_sub_pd(iy0
,jy0
);
1877 dz00
= _mm_sub_pd(iz0
,jz0
);
1878 dx01
= _mm_sub_pd(ix0
,jx1
);
1879 dy01
= _mm_sub_pd(iy0
,jy1
);
1880 dz01
= _mm_sub_pd(iz0
,jz1
);
1881 dx02
= _mm_sub_pd(ix0
,jx2
);
1882 dy02
= _mm_sub_pd(iy0
,jy2
);
1883 dz02
= _mm_sub_pd(iz0
,jz2
);
1884 dx10
= _mm_sub_pd(ix1
,jx0
);
1885 dy10
= _mm_sub_pd(iy1
,jy0
);
1886 dz10
= _mm_sub_pd(iz1
,jz0
);
1887 dx11
= _mm_sub_pd(ix1
,jx1
);
1888 dy11
= _mm_sub_pd(iy1
,jy1
);
1889 dz11
= _mm_sub_pd(iz1
,jz1
);
1890 dx12
= _mm_sub_pd(ix1
,jx2
);
1891 dy12
= _mm_sub_pd(iy1
,jy2
);
1892 dz12
= _mm_sub_pd(iz1
,jz2
);
1893 dx20
= _mm_sub_pd(ix2
,jx0
);
1894 dy20
= _mm_sub_pd(iy2
,jy0
);
1895 dz20
= _mm_sub_pd(iz2
,jz0
);
1896 dx21
= _mm_sub_pd(ix2
,jx1
);
1897 dy21
= _mm_sub_pd(iy2
,jy1
);
1898 dz21
= _mm_sub_pd(iz2
,jz1
);
1899 dx22
= _mm_sub_pd(ix2
,jx2
);
1900 dy22
= _mm_sub_pd(iy2
,jy2
);
1901 dz22
= _mm_sub_pd(iz2
,jz2
);
1903 /* Calculate squared distance and things based on it */
1904 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1905 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1906 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1907 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1908 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1909 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1910 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1911 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1912 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1914 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1915 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1916 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1917 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1918 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1919 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1920 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1921 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1922 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1924 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1925 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1926 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1927 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1928 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1929 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1930 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1931 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1932 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1934 fjx0
= _mm_setzero_pd();
1935 fjy0
= _mm_setzero_pd();
1936 fjz0
= _mm_setzero_pd();
1937 fjx1
= _mm_setzero_pd();
1938 fjy1
= _mm_setzero_pd();
1939 fjz1
= _mm_setzero_pd();
1940 fjx2
= _mm_setzero_pd();
1941 fjy2
= _mm_setzero_pd();
1942 fjz2
= _mm_setzero_pd();
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 r00
= _mm_mul_pd(rsq00
,rinv00
);
1950 /* Calculate table index by multiplying r with table scale and truncate to integer */
1951 rt
= _mm_mul_pd(r00
,vftabscale
);
1952 vfitab
= _mm_cvttpd_epi32(rt
);
1954 vfeps
= _mm_frcz_pd(rt
);
1956 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1958 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
1959 vfitab
= _mm_slli_epi32(vfitab
,3);
1961 /* EWALD ELECTROSTATICS */
1963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1964 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1965 ewitab
= _mm_cvttpd_epi32(ewrt
);
1967 eweps
= _mm_frcz_pd(ewrt
);
1969 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1971 twoeweps
= _mm_add_pd(eweps
,eweps
);
1972 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1973 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
1974 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1976 /* CUBIC SPLINE TABLE DISPERSION */
1977 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1978 F
= _mm_setzero_pd();
1979 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1980 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1981 H
= _mm_setzero_pd();
1982 GMX_MM_TRANSPOSE2_PD(G
,H
);
1983 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1984 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1985 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1987 /* CUBIC SPLINE TABLE REPULSION */
1988 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1989 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1990 F
= _mm_setzero_pd();
1991 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1992 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1993 H
= _mm_setzero_pd();
1994 GMX_MM_TRANSPOSE2_PD(G
,H
);
1995 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1996 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1997 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1998 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
2000 fscal
= _mm_add_pd(felec
,fvdw
);
2002 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2004 /* Update vectorial force */
2005 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
2006 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
2007 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
2009 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
2010 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
2011 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
2013 /**************************
2014 * CALCULATE INTERACTIONS *
2015 **************************/
2017 r01
= _mm_mul_pd(rsq01
,rinv01
);
2019 /* EWALD ELECTROSTATICS */
2021 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2022 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
2023 ewitab
= _mm_cvttpd_epi32(ewrt
);
2025 eweps
= _mm_frcz_pd(ewrt
);
2027 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2029 twoeweps
= _mm_add_pd(eweps
,eweps
);
2030 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2031 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2032 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
2036 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2038 /* Update vectorial force */
2039 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
2040 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
2041 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
2043 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
2044 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
2045 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 r02
= _mm_mul_pd(rsq02
,rinv02
);
2053 /* EWALD ELECTROSTATICS */
2055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2056 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
2057 ewitab
= _mm_cvttpd_epi32(ewrt
);
2059 eweps
= _mm_frcz_pd(ewrt
);
2061 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2063 twoeweps
= _mm_add_pd(eweps
,eweps
);
2064 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2065 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2066 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
2070 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2072 /* Update vectorial force */
2073 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
2074 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
2075 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
2077 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
2078 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
2079 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 r10
= _mm_mul_pd(rsq10
,rinv10
);
2087 /* EWALD ELECTROSTATICS */
2089 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2090 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
2091 ewitab
= _mm_cvttpd_epi32(ewrt
);
2093 eweps
= _mm_frcz_pd(ewrt
);
2095 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2097 twoeweps
= _mm_add_pd(eweps
,eweps
);
2098 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2099 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2100 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
2104 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2106 /* Update vectorial force */
2107 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
2108 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
2109 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
2111 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
2112 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
2113 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 r11
= _mm_mul_pd(rsq11
,rinv11
);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2125 ewitab
= _mm_cvttpd_epi32(ewrt
);
2127 eweps
= _mm_frcz_pd(ewrt
);
2129 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2131 twoeweps
= _mm_add_pd(eweps
,eweps
);
2132 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2133 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2134 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2138 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2140 /* Update vectorial force */
2141 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
2142 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
2143 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
2145 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
2146 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
2147 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
2149 /**************************
2150 * CALCULATE INTERACTIONS *
2151 **************************/
2153 r12
= _mm_mul_pd(rsq12
,rinv12
);
2155 /* EWALD ELECTROSTATICS */
2157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2158 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2159 ewitab
= _mm_cvttpd_epi32(ewrt
);
2161 eweps
= _mm_frcz_pd(ewrt
);
2163 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2165 twoeweps
= _mm_add_pd(eweps
,eweps
);
2166 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2167 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2168 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2172 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2174 /* Update vectorial force */
2175 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
2176 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
2177 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
2179 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
2180 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
2181 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
2183 /**************************
2184 * CALCULATE INTERACTIONS *
2185 **************************/
2187 r20
= _mm_mul_pd(rsq20
,rinv20
);
2189 /* EWALD ELECTROSTATICS */
2191 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2192 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
2193 ewitab
= _mm_cvttpd_epi32(ewrt
);
2195 eweps
= _mm_frcz_pd(ewrt
);
2197 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2199 twoeweps
= _mm_add_pd(eweps
,eweps
);
2200 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2201 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2202 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
2206 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2208 /* Update vectorial force */
2209 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
2210 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
2211 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
2213 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
2214 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
2215 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 r21
= _mm_mul_pd(rsq21
,rinv21
);
2223 /* EWALD ELECTROSTATICS */
2225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2227 ewitab
= _mm_cvttpd_epi32(ewrt
);
2229 eweps
= _mm_frcz_pd(ewrt
);
2231 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2233 twoeweps
= _mm_add_pd(eweps
,eweps
);
2234 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2235 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2236 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2240 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2242 /* Update vectorial force */
2243 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
2244 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
2245 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
2247 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
2248 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
2249 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r22
= _mm_mul_pd(rsq22
,rinv22
);
2257 /* EWALD ELECTROSTATICS */
2259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2261 ewitab
= _mm_cvttpd_epi32(ewrt
);
2263 eweps
= _mm_frcz_pd(ewrt
);
2265 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2267 twoeweps
= _mm_add_pd(eweps
,eweps
);
2268 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2269 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
2270 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2274 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2276 /* Update vectorial force */
2277 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
2278 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
2279 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
2281 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
2282 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
2283 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
2285 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2287 /* Inner loop uses 377 flops */
2290 /* End of innermost loop */
2292 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2293 f
+i_coord_offset
,fshift
+i_shift_offset
);
2295 /* Increment number of inner iterations */
2296 inneriter
+= j_index_end
- j_index_start
;
2298 /* Outer loop uses 18 flops */
2301 /* Increment number of outer iterations */
2304 /* Update outer/inner flops */
2306 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*377);