2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
116 __m128i ifour
= _mm_set1_epi32(4);
117 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
120 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
122 __m128d dummy_mask
,cutoff_mask
;
123 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
124 __m128d one
= _mm_set1_pd(1.0);
125 __m128d two
= _mm_set1_pd(2.0);
131 jindex
= nlist
->jindex
;
133 shiftidx
= nlist
->shift
;
135 shiftvec
= fr
->shift_vec
[0];
136 fshift
= fr
->fshift
[0];
137 facel
= _mm_set1_pd(fr
->epsfac
);
138 charge
= mdatoms
->chargeA
;
139 nvdwtype
= fr
->ntype
;
141 vdwtype
= mdatoms
->typeA
;
143 vftab
= kernel_data
->table_vdw
->data
;
144 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
146 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
147 ewtab
= fr
->ic
->tabq_coul_FDV0
;
148 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
149 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
151 /* Setup water-specific parameters */
152 inr
= nlist
->iinr
[0];
153 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
154 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
155 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
156 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
158 jq1
= _mm_set1_pd(charge
[inr
+1]);
159 jq2
= _mm_set1_pd(charge
[inr
+2]);
160 jq3
= _mm_set1_pd(charge
[inr
+3]);
161 vdwjidx0A
= 2*vdwtype
[inr
+0];
162 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
163 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
164 qq11
= _mm_mul_pd(iq1
,jq1
);
165 qq12
= _mm_mul_pd(iq1
,jq2
);
166 qq13
= _mm_mul_pd(iq1
,jq3
);
167 qq21
= _mm_mul_pd(iq2
,jq1
);
168 qq22
= _mm_mul_pd(iq2
,jq2
);
169 qq23
= _mm_mul_pd(iq2
,jq3
);
170 qq31
= _mm_mul_pd(iq3
,jq1
);
171 qq32
= _mm_mul_pd(iq3
,jq2
);
172 qq33
= _mm_mul_pd(iq3
,jq3
);
174 /* Avoid stupid compiler warnings */
182 /* Start outer loop over neighborlists */
183 for(iidx
=0; iidx
<nri
; iidx
++)
185 /* Load shift vector for this list */
186 i_shift_offset
= DIM
*shiftidx
[iidx
];
188 /* Load limits for loop over neighbors */
189 j_index_start
= jindex
[iidx
];
190 j_index_end
= jindex
[iidx
+1];
192 /* Get outer coordinate index */
194 i_coord_offset
= DIM
*inr
;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
198 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
200 fix0
= _mm_setzero_pd();
201 fiy0
= _mm_setzero_pd();
202 fiz0
= _mm_setzero_pd();
203 fix1
= _mm_setzero_pd();
204 fiy1
= _mm_setzero_pd();
205 fiz1
= _mm_setzero_pd();
206 fix2
= _mm_setzero_pd();
207 fiy2
= _mm_setzero_pd();
208 fiz2
= _mm_setzero_pd();
209 fix3
= _mm_setzero_pd();
210 fiy3
= _mm_setzero_pd();
211 fiz3
= _mm_setzero_pd();
213 /* Reset potential sums */
214 velecsum
= _mm_setzero_pd();
215 vvdwsum
= _mm_setzero_pd();
217 /* Start inner kernel loop */
218 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
221 /* Get j neighbor index, and coordinate index */
224 j_coord_offsetA
= DIM
*jnrA
;
225 j_coord_offsetB
= DIM
*jnrB
;
227 /* load j atom coordinates */
228 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
229 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
230 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
232 /* Calculate displacement vector */
233 dx00
= _mm_sub_pd(ix0
,jx0
);
234 dy00
= _mm_sub_pd(iy0
,jy0
);
235 dz00
= _mm_sub_pd(iz0
,jz0
);
236 dx11
= _mm_sub_pd(ix1
,jx1
);
237 dy11
= _mm_sub_pd(iy1
,jy1
);
238 dz11
= _mm_sub_pd(iz1
,jz1
);
239 dx12
= _mm_sub_pd(ix1
,jx2
);
240 dy12
= _mm_sub_pd(iy1
,jy2
);
241 dz12
= _mm_sub_pd(iz1
,jz2
);
242 dx13
= _mm_sub_pd(ix1
,jx3
);
243 dy13
= _mm_sub_pd(iy1
,jy3
);
244 dz13
= _mm_sub_pd(iz1
,jz3
);
245 dx21
= _mm_sub_pd(ix2
,jx1
);
246 dy21
= _mm_sub_pd(iy2
,jy1
);
247 dz21
= _mm_sub_pd(iz2
,jz1
);
248 dx22
= _mm_sub_pd(ix2
,jx2
);
249 dy22
= _mm_sub_pd(iy2
,jy2
);
250 dz22
= _mm_sub_pd(iz2
,jz2
);
251 dx23
= _mm_sub_pd(ix2
,jx3
);
252 dy23
= _mm_sub_pd(iy2
,jy3
);
253 dz23
= _mm_sub_pd(iz2
,jz3
);
254 dx31
= _mm_sub_pd(ix3
,jx1
);
255 dy31
= _mm_sub_pd(iy3
,jy1
);
256 dz31
= _mm_sub_pd(iz3
,jz1
);
257 dx32
= _mm_sub_pd(ix3
,jx2
);
258 dy32
= _mm_sub_pd(iy3
,jy2
);
259 dz32
= _mm_sub_pd(iz3
,jz2
);
260 dx33
= _mm_sub_pd(ix3
,jx3
);
261 dy33
= _mm_sub_pd(iy3
,jy3
);
262 dz33
= _mm_sub_pd(iz3
,jz3
);
264 /* Calculate squared distance and things based on it */
265 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
266 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
267 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
268 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
269 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
270 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
271 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
272 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
273 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
274 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
276 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
277 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
278 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
279 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
280 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
281 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
282 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
283 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
284 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
285 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
287 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
288 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
289 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
290 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
291 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
292 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
293 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
294 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
295 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
297 fjx0
= _mm_setzero_pd();
298 fjy0
= _mm_setzero_pd();
299 fjz0
= _mm_setzero_pd();
300 fjx1
= _mm_setzero_pd();
301 fjy1
= _mm_setzero_pd();
302 fjz1
= _mm_setzero_pd();
303 fjx2
= _mm_setzero_pd();
304 fjy2
= _mm_setzero_pd();
305 fjz2
= _mm_setzero_pd();
306 fjx3
= _mm_setzero_pd();
307 fjy3
= _mm_setzero_pd();
308 fjz3
= _mm_setzero_pd();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 r00
= _mm_mul_pd(rsq00
,rinv00
);
316 /* Calculate table index by multiplying r with table scale and truncate to integer */
317 rt
= _mm_mul_pd(r00
,vftabscale
);
318 vfitab
= _mm_cvttpd_epi32(rt
);
319 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
320 vfitab
= _mm_slli_epi32(vfitab
,3);
322 /* CUBIC SPLINE TABLE DISPERSION */
323 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
324 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
325 GMX_MM_TRANSPOSE2_PD(Y
,F
);
326 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
327 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
328 GMX_MM_TRANSPOSE2_PD(G
,H
);
329 Heps
= _mm_mul_pd(vfeps
,H
);
330 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
331 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
332 vvdw6
= _mm_mul_pd(c6_00
,VV
);
333 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
334 fvdw6
= _mm_mul_pd(c6_00
,FF
);
336 /* CUBIC SPLINE TABLE REPULSION */
337 vfitab
= _mm_add_epi32(vfitab
,ifour
);
338 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
339 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
340 GMX_MM_TRANSPOSE2_PD(Y
,F
);
341 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
342 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
343 GMX_MM_TRANSPOSE2_PD(G
,H
);
344 Heps
= _mm_mul_pd(vfeps
,H
);
345 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
346 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
347 vvdw12
= _mm_mul_pd(c12_00
,VV
);
348 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
349 fvdw12
= _mm_mul_pd(c12_00
,FF
);
350 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
351 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
358 /* Calculate temporary vectorial force */
359 tx
= _mm_mul_pd(fscal
,dx00
);
360 ty
= _mm_mul_pd(fscal
,dy00
);
361 tz
= _mm_mul_pd(fscal
,dz00
);
363 /* Update vectorial force */
364 fix0
= _mm_add_pd(fix0
,tx
);
365 fiy0
= _mm_add_pd(fiy0
,ty
);
366 fiz0
= _mm_add_pd(fiz0
,tz
);
368 fjx0
= _mm_add_pd(fjx0
,tx
);
369 fjy0
= _mm_add_pd(fjy0
,ty
);
370 fjz0
= _mm_add_pd(fjz0
,tz
);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r11
= _mm_mul_pd(rsq11
,rinv11
);
378 /* EWALD ELECTROSTATICS */
380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
381 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
382 ewitab
= _mm_cvttpd_epi32(ewrt
);
383 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
384 ewitab
= _mm_slli_epi32(ewitab
,2);
385 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
386 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
387 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
388 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
389 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
390 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
391 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
392 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
393 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
394 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
396 /* Update potential sum for this i atom from the interaction with this j atom. */
397 velecsum
= _mm_add_pd(velecsum
,velec
);
401 /* Calculate temporary vectorial force */
402 tx
= _mm_mul_pd(fscal
,dx11
);
403 ty
= _mm_mul_pd(fscal
,dy11
);
404 tz
= _mm_mul_pd(fscal
,dz11
);
406 /* Update vectorial force */
407 fix1
= _mm_add_pd(fix1
,tx
);
408 fiy1
= _mm_add_pd(fiy1
,ty
);
409 fiz1
= _mm_add_pd(fiz1
,tz
);
411 fjx1
= _mm_add_pd(fjx1
,tx
);
412 fjy1
= _mm_add_pd(fjy1
,ty
);
413 fjz1
= _mm_add_pd(fjz1
,tz
);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 r12
= _mm_mul_pd(rsq12
,rinv12
);
421 /* EWALD ELECTROSTATICS */
423 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
424 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
425 ewitab
= _mm_cvttpd_epi32(ewrt
);
426 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
427 ewitab
= _mm_slli_epi32(ewitab
,2);
428 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
429 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
430 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
431 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
432 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
433 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
434 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
435 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
436 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
437 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velecsum
= _mm_add_pd(velecsum
,velec
);
444 /* Calculate temporary vectorial force */
445 tx
= _mm_mul_pd(fscal
,dx12
);
446 ty
= _mm_mul_pd(fscal
,dy12
);
447 tz
= _mm_mul_pd(fscal
,dz12
);
449 /* Update vectorial force */
450 fix1
= _mm_add_pd(fix1
,tx
);
451 fiy1
= _mm_add_pd(fiy1
,ty
);
452 fiz1
= _mm_add_pd(fiz1
,tz
);
454 fjx2
= _mm_add_pd(fjx2
,tx
);
455 fjy2
= _mm_add_pd(fjy2
,ty
);
456 fjz2
= _mm_add_pd(fjz2
,tz
);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 r13
= _mm_mul_pd(rsq13
,rinv13
);
464 /* EWALD ELECTROSTATICS */
466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
467 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
468 ewitab
= _mm_cvttpd_epi32(ewrt
);
469 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
470 ewitab
= _mm_slli_epi32(ewitab
,2);
471 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
472 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
473 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
474 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
475 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
476 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
477 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
478 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
479 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
480 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum
= _mm_add_pd(velecsum
,velec
);
487 /* Calculate temporary vectorial force */
488 tx
= _mm_mul_pd(fscal
,dx13
);
489 ty
= _mm_mul_pd(fscal
,dy13
);
490 tz
= _mm_mul_pd(fscal
,dz13
);
492 /* Update vectorial force */
493 fix1
= _mm_add_pd(fix1
,tx
);
494 fiy1
= _mm_add_pd(fiy1
,ty
);
495 fiz1
= _mm_add_pd(fiz1
,tz
);
497 fjx3
= _mm_add_pd(fjx3
,tx
);
498 fjy3
= _mm_add_pd(fjy3
,ty
);
499 fjz3
= _mm_add_pd(fjz3
,tz
);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r21
= _mm_mul_pd(rsq21
,rinv21
);
507 /* EWALD ELECTROSTATICS */
509 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
510 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
511 ewitab
= _mm_cvttpd_epi32(ewrt
);
512 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
513 ewitab
= _mm_slli_epi32(ewitab
,2);
514 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
515 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
516 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
517 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
518 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
519 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
520 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
521 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
522 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
523 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum
= _mm_add_pd(velecsum
,velec
);
530 /* Calculate temporary vectorial force */
531 tx
= _mm_mul_pd(fscal
,dx21
);
532 ty
= _mm_mul_pd(fscal
,dy21
);
533 tz
= _mm_mul_pd(fscal
,dz21
);
535 /* Update vectorial force */
536 fix2
= _mm_add_pd(fix2
,tx
);
537 fiy2
= _mm_add_pd(fiy2
,ty
);
538 fiz2
= _mm_add_pd(fiz2
,tz
);
540 fjx1
= _mm_add_pd(fjx1
,tx
);
541 fjy1
= _mm_add_pd(fjy1
,ty
);
542 fjz1
= _mm_add_pd(fjz1
,tz
);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r22
= _mm_mul_pd(rsq22
,rinv22
);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
554 ewitab
= _mm_cvttpd_epi32(ewrt
);
555 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
556 ewitab
= _mm_slli_epi32(ewitab
,2);
557 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
558 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
559 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
560 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
561 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
562 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
563 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
564 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
565 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
566 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velecsum
= _mm_add_pd(velecsum
,velec
);
573 /* Calculate temporary vectorial force */
574 tx
= _mm_mul_pd(fscal
,dx22
);
575 ty
= _mm_mul_pd(fscal
,dy22
);
576 tz
= _mm_mul_pd(fscal
,dz22
);
578 /* Update vectorial force */
579 fix2
= _mm_add_pd(fix2
,tx
);
580 fiy2
= _mm_add_pd(fiy2
,ty
);
581 fiz2
= _mm_add_pd(fiz2
,tz
);
583 fjx2
= _mm_add_pd(fjx2
,tx
);
584 fjy2
= _mm_add_pd(fjy2
,ty
);
585 fjz2
= _mm_add_pd(fjz2
,tz
);
587 /**************************
588 * CALCULATE INTERACTIONS *
589 **************************/
591 r23
= _mm_mul_pd(rsq23
,rinv23
);
593 /* EWALD ELECTROSTATICS */
595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
597 ewitab
= _mm_cvttpd_epi32(ewrt
);
598 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
599 ewitab
= _mm_slli_epi32(ewitab
,2);
600 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
601 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
602 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
603 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
604 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
605 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
606 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
607 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
608 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
609 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velecsum
= _mm_add_pd(velecsum
,velec
);
616 /* Calculate temporary vectorial force */
617 tx
= _mm_mul_pd(fscal
,dx23
);
618 ty
= _mm_mul_pd(fscal
,dy23
);
619 tz
= _mm_mul_pd(fscal
,dz23
);
621 /* Update vectorial force */
622 fix2
= _mm_add_pd(fix2
,tx
);
623 fiy2
= _mm_add_pd(fiy2
,ty
);
624 fiz2
= _mm_add_pd(fiz2
,tz
);
626 fjx3
= _mm_add_pd(fjx3
,tx
);
627 fjy3
= _mm_add_pd(fjy3
,ty
);
628 fjz3
= _mm_add_pd(fjz3
,tz
);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 r31
= _mm_mul_pd(rsq31
,rinv31
);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
640 ewitab
= _mm_cvttpd_epi32(ewrt
);
641 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
642 ewitab
= _mm_slli_epi32(ewitab
,2);
643 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
644 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
645 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
646 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
647 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
648 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
649 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
650 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
651 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
652 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velecsum
= _mm_add_pd(velecsum
,velec
);
659 /* Calculate temporary vectorial force */
660 tx
= _mm_mul_pd(fscal
,dx31
);
661 ty
= _mm_mul_pd(fscal
,dy31
);
662 tz
= _mm_mul_pd(fscal
,dz31
);
664 /* Update vectorial force */
665 fix3
= _mm_add_pd(fix3
,tx
);
666 fiy3
= _mm_add_pd(fiy3
,ty
);
667 fiz3
= _mm_add_pd(fiz3
,tz
);
669 fjx1
= _mm_add_pd(fjx1
,tx
);
670 fjy1
= _mm_add_pd(fjy1
,ty
);
671 fjz1
= _mm_add_pd(fjz1
,tz
);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 r32
= _mm_mul_pd(rsq32
,rinv32
);
679 /* EWALD ELECTROSTATICS */
681 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
682 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
683 ewitab
= _mm_cvttpd_epi32(ewrt
);
684 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
685 ewitab
= _mm_slli_epi32(ewitab
,2);
686 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
687 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
688 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
689 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
690 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
691 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
692 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
693 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
694 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
695 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
697 /* Update potential sum for this i atom from the interaction with this j atom. */
698 velecsum
= _mm_add_pd(velecsum
,velec
);
702 /* Calculate temporary vectorial force */
703 tx
= _mm_mul_pd(fscal
,dx32
);
704 ty
= _mm_mul_pd(fscal
,dy32
);
705 tz
= _mm_mul_pd(fscal
,dz32
);
707 /* Update vectorial force */
708 fix3
= _mm_add_pd(fix3
,tx
);
709 fiy3
= _mm_add_pd(fiy3
,ty
);
710 fiz3
= _mm_add_pd(fiz3
,tz
);
712 fjx2
= _mm_add_pd(fjx2
,tx
);
713 fjy2
= _mm_add_pd(fjy2
,ty
);
714 fjz2
= _mm_add_pd(fjz2
,tz
);
716 /**************************
717 * CALCULATE INTERACTIONS *
718 **************************/
720 r33
= _mm_mul_pd(rsq33
,rinv33
);
722 /* EWALD ELECTROSTATICS */
724 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
725 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
726 ewitab
= _mm_cvttpd_epi32(ewrt
);
727 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
728 ewitab
= _mm_slli_epi32(ewitab
,2);
729 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
730 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
731 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
732 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
733 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
734 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
735 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
736 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
737 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
738 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
740 /* Update potential sum for this i atom from the interaction with this j atom. */
741 velecsum
= _mm_add_pd(velecsum
,velec
);
745 /* Calculate temporary vectorial force */
746 tx
= _mm_mul_pd(fscal
,dx33
);
747 ty
= _mm_mul_pd(fscal
,dy33
);
748 tz
= _mm_mul_pd(fscal
,dz33
);
750 /* Update vectorial force */
751 fix3
= _mm_add_pd(fix3
,tx
);
752 fiy3
= _mm_add_pd(fiy3
,ty
);
753 fiz3
= _mm_add_pd(fiz3
,tz
);
755 fjx3
= _mm_add_pd(fjx3
,tx
);
756 fjy3
= _mm_add_pd(fjy3
,ty
);
757 fjz3
= _mm_add_pd(fjz3
,tz
);
759 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
761 /* Inner loop uses 428 flops */
768 j_coord_offsetA
= DIM
*jnrA
;
770 /* load j atom coordinates */
771 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
772 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
773 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
775 /* Calculate displacement vector */
776 dx00
= _mm_sub_pd(ix0
,jx0
);
777 dy00
= _mm_sub_pd(iy0
,jy0
);
778 dz00
= _mm_sub_pd(iz0
,jz0
);
779 dx11
= _mm_sub_pd(ix1
,jx1
);
780 dy11
= _mm_sub_pd(iy1
,jy1
);
781 dz11
= _mm_sub_pd(iz1
,jz1
);
782 dx12
= _mm_sub_pd(ix1
,jx2
);
783 dy12
= _mm_sub_pd(iy1
,jy2
);
784 dz12
= _mm_sub_pd(iz1
,jz2
);
785 dx13
= _mm_sub_pd(ix1
,jx3
);
786 dy13
= _mm_sub_pd(iy1
,jy3
);
787 dz13
= _mm_sub_pd(iz1
,jz3
);
788 dx21
= _mm_sub_pd(ix2
,jx1
);
789 dy21
= _mm_sub_pd(iy2
,jy1
);
790 dz21
= _mm_sub_pd(iz2
,jz1
);
791 dx22
= _mm_sub_pd(ix2
,jx2
);
792 dy22
= _mm_sub_pd(iy2
,jy2
);
793 dz22
= _mm_sub_pd(iz2
,jz2
);
794 dx23
= _mm_sub_pd(ix2
,jx3
);
795 dy23
= _mm_sub_pd(iy2
,jy3
);
796 dz23
= _mm_sub_pd(iz2
,jz3
);
797 dx31
= _mm_sub_pd(ix3
,jx1
);
798 dy31
= _mm_sub_pd(iy3
,jy1
);
799 dz31
= _mm_sub_pd(iz3
,jz1
);
800 dx32
= _mm_sub_pd(ix3
,jx2
);
801 dy32
= _mm_sub_pd(iy3
,jy2
);
802 dz32
= _mm_sub_pd(iz3
,jz2
);
803 dx33
= _mm_sub_pd(ix3
,jx3
);
804 dy33
= _mm_sub_pd(iy3
,jy3
);
805 dz33
= _mm_sub_pd(iz3
,jz3
);
807 /* Calculate squared distance and things based on it */
808 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
809 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
810 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
811 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
812 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
813 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
814 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
815 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
816 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
817 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
819 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
820 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
821 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
822 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
823 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
824 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
825 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
826 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
827 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
828 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
830 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
831 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
832 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
833 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
834 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
835 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
836 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
837 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
838 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
840 fjx0
= _mm_setzero_pd();
841 fjy0
= _mm_setzero_pd();
842 fjz0
= _mm_setzero_pd();
843 fjx1
= _mm_setzero_pd();
844 fjy1
= _mm_setzero_pd();
845 fjz1
= _mm_setzero_pd();
846 fjx2
= _mm_setzero_pd();
847 fjy2
= _mm_setzero_pd();
848 fjz2
= _mm_setzero_pd();
849 fjx3
= _mm_setzero_pd();
850 fjy3
= _mm_setzero_pd();
851 fjz3
= _mm_setzero_pd();
853 /**************************
854 * CALCULATE INTERACTIONS *
855 **************************/
857 r00
= _mm_mul_pd(rsq00
,rinv00
);
859 /* Calculate table index by multiplying r with table scale and truncate to integer */
860 rt
= _mm_mul_pd(r00
,vftabscale
);
861 vfitab
= _mm_cvttpd_epi32(rt
);
862 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
863 vfitab
= _mm_slli_epi32(vfitab
,3);
865 /* CUBIC SPLINE TABLE DISPERSION */
866 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
867 F
= _mm_setzero_pd();
868 GMX_MM_TRANSPOSE2_PD(Y
,F
);
869 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
870 H
= _mm_setzero_pd();
871 GMX_MM_TRANSPOSE2_PD(G
,H
);
872 Heps
= _mm_mul_pd(vfeps
,H
);
873 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
874 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
875 vvdw6
= _mm_mul_pd(c6_00
,VV
);
876 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
877 fvdw6
= _mm_mul_pd(c6_00
,FF
);
879 /* CUBIC SPLINE TABLE REPULSION */
880 vfitab
= _mm_add_epi32(vfitab
,ifour
);
881 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
882 F
= _mm_setzero_pd();
883 GMX_MM_TRANSPOSE2_PD(Y
,F
);
884 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
885 H
= _mm_setzero_pd();
886 GMX_MM_TRANSPOSE2_PD(G
,H
);
887 Heps
= _mm_mul_pd(vfeps
,H
);
888 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
889 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
890 vvdw12
= _mm_mul_pd(c12_00
,VV
);
891 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
892 fvdw12
= _mm_mul_pd(c12_00
,FF
);
893 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
894 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
898 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
902 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
904 /* Calculate temporary vectorial force */
905 tx
= _mm_mul_pd(fscal
,dx00
);
906 ty
= _mm_mul_pd(fscal
,dy00
);
907 tz
= _mm_mul_pd(fscal
,dz00
);
909 /* Update vectorial force */
910 fix0
= _mm_add_pd(fix0
,tx
);
911 fiy0
= _mm_add_pd(fiy0
,ty
);
912 fiz0
= _mm_add_pd(fiz0
,tz
);
914 fjx0
= _mm_add_pd(fjx0
,tx
);
915 fjy0
= _mm_add_pd(fjy0
,ty
);
916 fjz0
= _mm_add_pd(fjz0
,tz
);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r11
= _mm_mul_pd(rsq11
,rinv11
);
924 /* EWALD ELECTROSTATICS */
926 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
927 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
928 ewitab
= _mm_cvttpd_epi32(ewrt
);
929 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
930 ewitab
= _mm_slli_epi32(ewitab
,2);
931 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
932 ewtabD
= _mm_setzero_pd();
933 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
934 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
935 ewtabFn
= _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
937 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
938 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
939 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
940 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
944 velecsum
= _mm_add_pd(velecsum
,velec
);
948 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
950 /* Calculate temporary vectorial force */
951 tx
= _mm_mul_pd(fscal
,dx11
);
952 ty
= _mm_mul_pd(fscal
,dy11
);
953 tz
= _mm_mul_pd(fscal
,dz11
);
955 /* Update vectorial force */
956 fix1
= _mm_add_pd(fix1
,tx
);
957 fiy1
= _mm_add_pd(fiy1
,ty
);
958 fiz1
= _mm_add_pd(fiz1
,tz
);
960 fjx1
= _mm_add_pd(fjx1
,tx
);
961 fjy1
= _mm_add_pd(fjy1
,ty
);
962 fjz1
= _mm_add_pd(fjz1
,tz
);
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
968 r12
= _mm_mul_pd(rsq12
,rinv12
);
970 /* EWALD ELECTROSTATICS */
972 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
973 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
974 ewitab
= _mm_cvttpd_epi32(ewrt
);
975 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
976 ewitab
= _mm_slli_epi32(ewitab
,2);
977 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
978 ewtabD
= _mm_setzero_pd();
979 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
980 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
981 ewtabFn
= _mm_setzero_pd();
982 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
983 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
984 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
985 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
986 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
990 velecsum
= _mm_add_pd(velecsum
,velec
);
994 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
996 /* Calculate temporary vectorial force */
997 tx
= _mm_mul_pd(fscal
,dx12
);
998 ty
= _mm_mul_pd(fscal
,dy12
);
999 tz
= _mm_mul_pd(fscal
,dz12
);
1001 /* Update vectorial force */
1002 fix1
= _mm_add_pd(fix1
,tx
);
1003 fiy1
= _mm_add_pd(fiy1
,ty
);
1004 fiz1
= _mm_add_pd(fiz1
,tz
);
1006 fjx2
= _mm_add_pd(fjx2
,tx
);
1007 fjy2
= _mm_add_pd(fjy2
,ty
);
1008 fjz2
= _mm_add_pd(fjz2
,tz
);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 r13
= _mm_mul_pd(rsq13
,rinv13
);
1016 /* EWALD ELECTROSTATICS */
1018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1020 ewitab
= _mm_cvttpd_epi32(ewrt
);
1021 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1022 ewitab
= _mm_slli_epi32(ewitab
,2);
1023 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1024 ewtabD
= _mm_setzero_pd();
1025 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1026 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1027 ewtabFn
= _mm_setzero_pd();
1028 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1029 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1030 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1031 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
1032 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1036 velecsum
= _mm_add_pd(velecsum
,velec
);
1040 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1042 /* Calculate temporary vectorial force */
1043 tx
= _mm_mul_pd(fscal
,dx13
);
1044 ty
= _mm_mul_pd(fscal
,dy13
);
1045 tz
= _mm_mul_pd(fscal
,dz13
);
1047 /* Update vectorial force */
1048 fix1
= _mm_add_pd(fix1
,tx
);
1049 fiy1
= _mm_add_pd(fiy1
,ty
);
1050 fiz1
= _mm_add_pd(fiz1
,tz
);
1052 fjx3
= _mm_add_pd(fjx3
,tx
);
1053 fjy3
= _mm_add_pd(fjy3
,ty
);
1054 fjz3
= _mm_add_pd(fjz3
,tz
);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 r21
= _mm_mul_pd(rsq21
,rinv21
);
1062 /* EWALD ELECTROSTATICS */
1064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1065 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1066 ewitab
= _mm_cvttpd_epi32(ewrt
);
1067 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1068 ewitab
= _mm_slli_epi32(ewitab
,2);
1069 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1070 ewtabD
= _mm_setzero_pd();
1071 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1072 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1073 ewtabFn
= _mm_setzero_pd();
1074 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1075 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1076 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1077 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
1078 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1082 velecsum
= _mm_add_pd(velecsum
,velec
);
1086 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1088 /* Calculate temporary vectorial force */
1089 tx
= _mm_mul_pd(fscal
,dx21
);
1090 ty
= _mm_mul_pd(fscal
,dy21
);
1091 tz
= _mm_mul_pd(fscal
,dz21
);
1093 /* Update vectorial force */
1094 fix2
= _mm_add_pd(fix2
,tx
);
1095 fiy2
= _mm_add_pd(fiy2
,ty
);
1096 fiz2
= _mm_add_pd(fiz2
,tz
);
1098 fjx1
= _mm_add_pd(fjx1
,tx
);
1099 fjy1
= _mm_add_pd(fjy1
,ty
);
1100 fjz1
= _mm_add_pd(fjz1
,tz
);
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1106 r22
= _mm_mul_pd(rsq22
,rinv22
);
1108 /* EWALD ELECTROSTATICS */
1110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1111 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1112 ewitab
= _mm_cvttpd_epi32(ewrt
);
1113 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1114 ewitab
= _mm_slli_epi32(ewitab
,2);
1115 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1116 ewtabD
= _mm_setzero_pd();
1117 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1118 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1119 ewtabFn
= _mm_setzero_pd();
1120 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1121 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1122 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1123 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1124 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1128 velecsum
= _mm_add_pd(velecsum
,velec
);
1132 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1134 /* Calculate temporary vectorial force */
1135 tx
= _mm_mul_pd(fscal
,dx22
);
1136 ty
= _mm_mul_pd(fscal
,dy22
);
1137 tz
= _mm_mul_pd(fscal
,dz22
);
1139 /* Update vectorial force */
1140 fix2
= _mm_add_pd(fix2
,tx
);
1141 fiy2
= _mm_add_pd(fiy2
,ty
);
1142 fiz2
= _mm_add_pd(fiz2
,tz
);
1144 fjx2
= _mm_add_pd(fjx2
,tx
);
1145 fjy2
= _mm_add_pd(fjy2
,ty
);
1146 fjz2
= _mm_add_pd(fjz2
,tz
);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 r23
= _mm_mul_pd(rsq23
,rinv23
);
1154 /* EWALD ELECTROSTATICS */
1156 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1157 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1158 ewitab
= _mm_cvttpd_epi32(ewrt
);
1159 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1160 ewitab
= _mm_slli_epi32(ewitab
,2);
1161 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1162 ewtabD
= _mm_setzero_pd();
1163 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1164 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1165 ewtabFn
= _mm_setzero_pd();
1166 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1167 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1168 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1169 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
1170 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1174 velecsum
= _mm_add_pd(velecsum
,velec
);
1178 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1180 /* Calculate temporary vectorial force */
1181 tx
= _mm_mul_pd(fscal
,dx23
);
1182 ty
= _mm_mul_pd(fscal
,dy23
);
1183 tz
= _mm_mul_pd(fscal
,dz23
);
1185 /* Update vectorial force */
1186 fix2
= _mm_add_pd(fix2
,tx
);
1187 fiy2
= _mm_add_pd(fiy2
,ty
);
1188 fiz2
= _mm_add_pd(fiz2
,tz
);
1190 fjx3
= _mm_add_pd(fjx3
,tx
);
1191 fjy3
= _mm_add_pd(fjy3
,ty
);
1192 fjz3
= _mm_add_pd(fjz3
,tz
);
1194 /**************************
1195 * CALCULATE INTERACTIONS *
1196 **************************/
1198 r31
= _mm_mul_pd(rsq31
,rinv31
);
1200 /* EWALD ELECTROSTATICS */
1202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1204 ewitab
= _mm_cvttpd_epi32(ewrt
);
1205 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1206 ewitab
= _mm_slli_epi32(ewitab
,2);
1207 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1208 ewtabD
= _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1210 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1211 ewtabFn
= _mm_setzero_pd();
1212 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1213 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1214 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1215 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
1216 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1218 /* Update potential sum for this i atom from the interaction with this j atom. */
1219 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1220 velecsum
= _mm_add_pd(velecsum
,velec
);
1224 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1226 /* Calculate temporary vectorial force */
1227 tx
= _mm_mul_pd(fscal
,dx31
);
1228 ty
= _mm_mul_pd(fscal
,dy31
);
1229 tz
= _mm_mul_pd(fscal
,dz31
);
1231 /* Update vectorial force */
1232 fix3
= _mm_add_pd(fix3
,tx
);
1233 fiy3
= _mm_add_pd(fiy3
,ty
);
1234 fiz3
= _mm_add_pd(fiz3
,tz
);
1236 fjx1
= _mm_add_pd(fjx1
,tx
);
1237 fjy1
= _mm_add_pd(fjy1
,ty
);
1238 fjz1
= _mm_add_pd(fjz1
,tz
);
1240 /**************************
1241 * CALCULATE INTERACTIONS *
1242 **************************/
1244 r32
= _mm_mul_pd(rsq32
,rinv32
);
1246 /* EWALD ELECTROSTATICS */
1248 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1249 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1250 ewitab
= _mm_cvttpd_epi32(ewrt
);
1251 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1252 ewitab
= _mm_slli_epi32(ewitab
,2);
1253 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1254 ewtabD
= _mm_setzero_pd();
1255 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1256 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1257 ewtabFn
= _mm_setzero_pd();
1258 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1259 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1260 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1261 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
1262 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1264 /* Update potential sum for this i atom from the interaction with this j atom. */
1265 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1266 velecsum
= _mm_add_pd(velecsum
,velec
);
1270 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1272 /* Calculate temporary vectorial force */
1273 tx
= _mm_mul_pd(fscal
,dx32
);
1274 ty
= _mm_mul_pd(fscal
,dy32
);
1275 tz
= _mm_mul_pd(fscal
,dz32
);
1277 /* Update vectorial force */
1278 fix3
= _mm_add_pd(fix3
,tx
);
1279 fiy3
= _mm_add_pd(fiy3
,ty
);
1280 fiz3
= _mm_add_pd(fiz3
,tz
);
1282 fjx2
= _mm_add_pd(fjx2
,tx
);
1283 fjy2
= _mm_add_pd(fjy2
,ty
);
1284 fjz2
= _mm_add_pd(fjz2
,tz
);
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 r33
= _mm_mul_pd(rsq33
,rinv33
);
1292 /* EWALD ELECTROSTATICS */
1294 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1295 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1296 ewitab
= _mm_cvttpd_epi32(ewrt
);
1297 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1298 ewitab
= _mm_slli_epi32(ewitab
,2);
1299 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1300 ewtabD
= _mm_setzero_pd();
1301 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1302 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1303 ewtabFn
= _mm_setzero_pd();
1304 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1305 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1306 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1307 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
1308 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1310 /* Update potential sum for this i atom from the interaction with this j atom. */
1311 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1312 velecsum
= _mm_add_pd(velecsum
,velec
);
1316 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1318 /* Calculate temporary vectorial force */
1319 tx
= _mm_mul_pd(fscal
,dx33
);
1320 ty
= _mm_mul_pd(fscal
,dy33
);
1321 tz
= _mm_mul_pd(fscal
,dz33
);
1323 /* Update vectorial force */
1324 fix3
= _mm_add_pd(fix3
,tx
);
1325 fiy3
= _mm_add_pd(fiy3
,ty
);
1326 fiz3
= _mm_add_pd(fiz3
,tz
);
1328 fjx3
= _mm_add_pd(fjx3
,tx
);
1329 fjy3
= _mm_add_pd(fjy3
,ty
);
1330 fjz3
= _mm_add_pd(fjz3
,tz
);
1332 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1334 /* Inner loop uses 428 flops */
1337 /* End of innermost loop */
1339 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1340 f
+i_coord_offset
,fshift
+i_shift_offset
);
1343 /* Update potential energies */
1344 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1345 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1347 /* Increment number of inner iterations */
1348 inneriter
+= j_index_end
- j_index_start
;
1350 /* Outer loop uses 26 flops */
1353 /* Increment number of outer iterations */
1356 /* Update outer/inner flops */
1358 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*428);
1361 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double
1362 * Electrostatics interaction: Ewald
1363 * VdW interaction: CubicSplineTable
1364 * Geometry: Water4-Water4
1365 * Calculate force/pot: Force
1368 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double
1369 (t_nblist
* gmx_restrict nlist
,
1370 rvec
* gmx_restrict xx
,
1371 rvec
* gmx_restrict ff
,
1372 t_forcerec
* gmx_restrict fr
,
1373 t_mdatoms
* gmx_restrict mdatoms
,
1374 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1375 t_nrnb
* gmx_restrict nrnb
)
1377 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1378 * just 0 for non-waters.
1379 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1380 * jnr indices corresponding to data put in the four positions in the SIMD register.
1382 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1383 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1385 int j_coord_offsetA
,j_coord_offsetB
;
1386 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1387 real rcutoff_scalar
;
1388 real
*shiftvec
,*fshift
,*x
,*f
;
1389 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1391 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1393 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1395 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1397 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1398 int vdwjidx0A
,vdwjidx0B
;
1399 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1400 int vdwjidx1A
,vdwjidx1B
;
1401 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1402 int vdwjidx2A
,vdwjidx2B
;
1403 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1404 int vdwjidx3A
,vdwjidx3B
;
1405 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1406 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1407 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1408 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1409 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1410 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1411 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1412 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1413 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1414 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1415 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1416 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1419 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1422 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1423 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1425 __m128i ifour
= _mm_set1_epi32(4);
1426 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1429 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1431 __m128d dummy_mask
,cutoff_mask
;
1432 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1433 __m128d one
= _mm_set1_pd(1.0);
1434 __m128d two
= _mm_set1_pd(2.0);
1440 jindex
= nlist
->jindex
;
1442 shiftidx
= nlist
->shift
;
1444 shiftvec
= fr
->shift_vec
[0];
1445 fshift
= fr
->fshift
[0];
1446 facel
= _mm_set1_pd(fr
->epsfac
);
1447 charge
= mdatoms
->chargeA
;
1448 nvdwtype
= fr
->ntype
;
1449 vdwparam
= fr
->nbfp
;
1450 vdwtype
= mdatoms
->typeA
;
1452 vftab
= kernel_data
->table_vdw
->data
;
1453 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
1455 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1456 ewtab
= fr
->ic
->tabq_coul_F
;
1457 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1458 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1460 /* Setup water-specific parameters */
1461 inr
= nlist
->iinr
[0];
1462 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1463 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1464 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1465 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1467 jq1
= _mm_set1_pd(charge
[inr
+1]);
1468 jq2
= _mm_set1_pd(charge
[inr
+2]);
1469 jq3
= _mm_set1_pd(charge
[inr
+3]);
1470 vdwjidx0A
= 2*vdwtype
[inr
+0];
1471 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1472 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1473 qq11
= _mm_mul_pd(iq1
,jq1
);
1474 qq12
= _mm_mul_pd(iq1
,jq2
);
1475 qq13
= _mm_mul_pd(iq1
,jq3
);
1476 qq21
= _mm_mul_pd(iq2
,jq1
);
1477 qq22
= _mm_mul_pd(iq2
,jq2
);
1478 qq23
= _mm_mul_pd(iq2
,jq3
);
1479 qq31
= _mm_mul_pd(iq3
,jq1
);
1480 qq32
= _mm_mul_pd(iq3
,jq2
);
1481 qq33
= _mm_mul_pd(iq3
,jq3
);
1483 /* Avoid stupid compiler warnings */
1485 j_coord_offsetA
= 0;
1486 j_coord_offsetB
= 0;
1491 /* Start outer loop over neighborlists */
1492 for(iidx
=0; iidx
<nri
; iidx
++)
1494 /* Load shift vector for this list */
1495 i_shift_offset
= DIM
*shiftidx
[iidx
];
1497 /* Load limits for loop over neighbors */
1498 j_index_start
= jindex
[iidx
];
1499 j_index_end
= jindex
[iidx
+1];
1501 /* Get outer coordinate index */
1503 i_coord_offset
= DIM
*inr
;
1505 /* Load i particle coords and add shift vector */
1506 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1507 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1509 fix0
= _mm_setzero_pd();
1510 fiy0
= _mm_setzero_pd();
1511 fiz0
= _mm_setzero_pd();
1512 fix1
= _mm_setzero_pd();
1513 fiy1
= _mm_setzero_pd();
1514 fiz1
= _mm_setzero_pd();
1515 fix2
= _mm_setzero_pd();
1516 fiy2
= _mm_setzero_pd();
1517 fiz2
= _mm_setzero_pd();
1518 fix3
= _mm_setzero_pd();
1519 fiy3
= _mm_setzero_pd();
1520 fiz3
= _mm_setzero_pd();
1522 /* Start inner kernel loop */
1523 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1526 /* Get j neighbor index, and coordinate index */
1528 jnrB
= jjnr
[jidx
+1];
1529 j_coord_offsetA
= DIM
*jnrA
;
1530 j_coord_offsetB
= DIM
*jnrB
;
1532 /* load j atom coordinates */
1533 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1534 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1535 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1537 /* Calculate displacement vector */
1538 dx00
= _mm_sub_pd(ix0
,jx0
);
1539 dy00
= _mm_sub_pd(iy0
,jy0
);
1540 dz00
= _mm_sub_pd(iz0
,jz0
);
1541 dx11
= _mm_sub_pd(ix1
,jx1
);
1542 dy11
= _mm_sub_pd(iy1
,jy1
);
1543 dz11
= _mm_sub_pd(iz1
,jz1
);
1544 dx12
= _mm_sub_pd(ix1
,jx2
);
1545 dy12
= _mm_sub_pd(iy1
,jy2
);
1546 dz12
= _mm_sub_pd(iz1
,jz2
);
1547 dx13
= _mm_sub_pd(ix1
,jx3
);
1548 dy13
= _mm_sub_pd(iy1
,jy3
);
1549 dz13
= _mm_sub_pd(iz1
,jz3
);
1550 dx21
= _mm_sub_pd(ix2
,jx1
);
1551 dy21
= _mm_sub_pd(iy2
,jy1
);
1552 dz21
= _mm_sub_pd(iz2
,jz1
);
1553 dx22
= _mm_sub_pd(ix2
,jx2
);
1554 dy22
= _mm_sub_pd(iy2
,jy2
);
1555 dz22
= _mm_sub_pd(iz2
,jz2
);
1556 dx23
= _mm_sub_pd(ix2
,jx3
);
1557 dy23
= _mm_sub_pd(iy2
,jy3
);
1558 dz23
= _mm_sub_pd(iz2
,jz3
);
1559 dx31
= _mm_sub_pd(ix3
,jx1
);
1560 dy31
= _mm_sub_pd(iy3
,jy1
);
1561 dz31
= _mm_sub_pd(iz3
,jz1
);
1562 dx32
= _mm_sub_pd(ix3
,jx2
);
1563 dy32
= _mm_sub_pd(iy3
,jy2
);
1564 dz32
= _mm_sub_pd(iz3
,jz2
);
1565 dx33
= _mm_sub_pd(ix3
,jx3
);
1566 dy33
= _mm_sub_pd(iy3
,jy3
);
1567 dz33
= _mm_sub_pd(iz3
,jz3
);
1569 /* Calculate squared distance and things based on it */
1570 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1571 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1572 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1573 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1574 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1575 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1576 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1577 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1578 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1579 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1581 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1582 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1583 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1584 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1585 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1586 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1587 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1588 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1589 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1590 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1592 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1593 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1594 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1595 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1596 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1597 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1598 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1599 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1600 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1602 fjx0
= _mm_setzero_pd();
1603 fjy0
= _mm_setzero_pd();
1604 fjz0
= _mm_setzero_pd();
1605 fjx1
= _mm_setzero_pd();
1606 fjy1
= _mm_setzero_pd();
1607 fjz1
= _mm_setzero_pd();
1608 fjx2
= _mm_setzero_pd();
1609 fjy2
= _mm_setzero_pd();
1610 fjz2
= _mm_setzero_pd();
1611 fjx3
= _mm_setzero_pd();
1612 fjy3
= _mm_setzero_pd();
1613 fjz3
= _mm_setzero_pd();
1615 /**************************
1616 * CALCULATE INTERACTIONS *
1617 **************************/
1619 r00
= _mm_mul_pd(rsq00
,rinv00
);
1621 /* Calculate table index by multiplying r with table scale and truncate to integer */
1622 rt
= _mm_mul_pd(r00
,vftabscale
);
1623 vfitab
= _mm_cvttpd_epi32(rt
);
1624 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1625 vfitab
= _mm_slli_epi32(vfitab
,3);
1627 /* CUBIC SPLINE TABLE DISPERSION */
1628 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1629 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1630 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1631 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1632 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1633 GMX_MM_TRANSPOSE2_PD(G
,H
);
1634 Heps
= _mm_mul_pd(vfeps
,H
);
1635 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1636 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1637 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1639 /* CUBIC SPLINE TABLE REPULSION */
1640 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1641 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1642 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1643 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1644 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1645 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1646 GMX_MM_TRANSPOSE2_PD(G
,H
);
1647 Heps
= _mm_mul_pd(vfeps
,H
);
1648 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1649 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1650 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1651 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1655 /* Calculate temporary vectorial force */
1656 tx
= _mm_mul_pd(fscal
,dx00
);
1657 ty
= _mm_mul_pd(fscal
,dy00
);
1658 tz
= _mm_mul_pd(fscal
,dz00
);
1660 /* Update vectorial force */
1661 fix0
= _mm_add_pd(fix0
,tx
);
1662 fiy0
= _mm_add_pd(fiy0
,ty
);
1663 fiz0
= _mm_add_pd(fiz0
,tz
);
1665 fjx0
= _mm_add_pd(fjx0
,tx
);
1666 fjy0
= _mm_add_pd(fjy0
,ty
);
1667 fjz0
= _mm_add_pd(fjz0
,tz
);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 r11
= _mm_mul_pd(rsq11
,rinv11
);
1675 /* EWALD ELECTROSTATICS */
1677 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1678 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1679 ewitab
= _mm_cvttpd_epi32(ewrt
);
1680 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1681 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1683 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1684 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1688 /* Calculate temporary vectorial force */
1689 tx
= _mm_mul_pd(fscal
,dx11
);
1690 ty
= _mm_mul_pd(fscal
,dy11
);
1691 tz
= _mm_mul_pd(fscal
,dz11
);
1693 /* Update vectorial force */
1694 fix1
= _mm_add_pd(fix1
,tx
);
1695 fiy1
= _mm_add_pd(fiy1
,ty
);
1696 fiz1
= _mm_add_pd(fiz1
,tz
);
1698 fjx1
= _mm_add_pd(fjx1
,tx
);
1699 fjy1
= _mm_add_pd(fjy1
,ty
);
1700 fjz1
= _mm_add_pd(fjz1
,tz
);
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1706 r12
= _mm_mul_pd(rsq12
,rinv12
);
1708 /* EWALD ELECTROSTATICS */
1710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1711 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1712 ewitab
= _mm_cvttpd_epi32(ewrt
);
1713 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1714 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1716 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1717 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1721 /* Calculate temporary vectorial force */
1722 tx
= _mm_mul_pd(fscal
,dx12
);
1723 ty
= _mm_mul_pd(fscal
,dy12
);
1724 tz
= _mm_mul_pd(fscal
,dz12
);
1726 /* Update vectorial force */
1727 fix1
= _mm_add_pd(fix1
,tx
);
1728 fiy1
= _mm_add_pd(fiy1
,ty
);
1729 fiz1
= _mm_add_pd(fiz1
,tz
);
1731 fjx2
= _mm_add_pd(fjx2
,tx
);
1732 fjy2
= _mm_add_pd(fjy2
,ty
);
1733 fjz2
= _mm_add_pd(fjz2
,tz
);
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1739 r13
= _mm_mul_pd(rsq13
,rinv13
);
1741 /* EWALD ELECTROSTATICS */
1743 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1744 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1745 ewitab
= _mm_cvttpd_epi32(ewrt
);
1746 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1747 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1749 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1750 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1754 /* Calculate temporary vectorial force */
1755 tx
= _mm_mul_pd(fscal
,dx13
);
1756 ty
= _mm_mul_pd(fscal
,dy13
);
1757 tz
= _mm_mul_pd(fscal
,dz13
);
1759 /* Update vectorial force */
1760 fix1
= _mm_add_pd(fix1
,tx
);
1761 fiy1
= _mm_add_pd(fiy1
,ty
);
1762 fiz1
= _mm_add_pd(fiz1
,tz
);
1764 fjx3
= _mm_add_pd(fjx3
,tx
);
1765 fjy3
= _mm_add_pd(fjy3
,ty
);
1766 fjz3
= _mm_add_pd(fjz3
,tz
);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 r21
= _mm_mul_pd(rsq21
,rinv21
);
1774 /* EWALD ELECTROSTATICS */
1776 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1777 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1778 ewitab
= _mm_cvttpd_epi32(ewrt
);
1779 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1780 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1782 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1783 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1787 /* Calculate temporary vectorial force */
1788 tx
= _mm_mul_pd(fscal
,dx21
);
1789 ty
= _mm_mul_pd(fscal
,dy21
);
1790 tz
= _mm_mul_pd(fscal
,dz21
);
1792 /* Update vectorial force */
1793 fix2
= _mm_add_pd(fix2
,tx
);
1794 fiy2
= _mm_add_pd(fiy2
,ty
);
1795 fiz2
= _mm_add_pd(fiz2
,tz
);
1797 fjx1
= _mm_add_pd(fjx1
,tx
);
1798 fjy1
= _mm_add_pd(fjy1
,ty
);
1799 fjz1
= _mm_add_pd(fjz1
,tz
);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 r22
= _mm_mul_pd(rsq22
,rinv22
);
1807 /* EWALD ELECTROSTATICS */
1809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1810 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1811 ewitab
= _mm_cvttpd_epi32(ewrt
);
1812 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1813 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1815 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1816 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1820 /* Calculate temporary vectorial force */
1821 tx
= _mm_mul_pd(fscal
,dx22
);
1822 ty
= _mm_mul_pd(fscal
,dy22
);
1823 tz
= _mm_mul_pd(fscal
,dz22
);
1825 /* Update vectorial force */
1826 fix2
= _mm_add_pd(fix2
,tx
);
1827 fiy2
= _mm_add_pd(fiy2
,ty
);
1828 fiz2
= _mm_add_pd(fiz2
,tz
);
1830 fjx2
= _mm_add_pd(fjx2
,tx
);
1831 fjy2
= _mm_add_pd(fjy2
,ty
);
1832 fjz2
= _mm_add_pd(fjz2
,tz
);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r23
= _mm_mul_pd(rsq23
,rinv23
);
1840 /* EWALD ELECTROSTATICS */
1842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1843 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1844 ewitab
= _mm_cvttpd_epi32(ewrt
);
1845 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1846 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1848 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1849 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1853 /* Calculate temporary vectorial force */
1854 tx
= _mm_mul_pd(fscal
,dx23
);
1855 ty
= _mm_mul_pd(fscal
,dy23
);
1856 tz
= _mm_mul_pd(fscal
,dz23
);
1858 /* Update vectorial force */
1859 fix2
= _mm_add_pd(fix2
,tx
);
1860 fiy2
= _mm_add_pd(fiy2
,ty
);
1861 fiz2
= _mm_add_pd(fiz2
,tz
);
1863 fjx3
= _mm_add_pd(fjx3
,tx
);
1864 fjy3
= _mm_add_pd(fjy3
,ty
);
1865 fjz3
= _mm_add_pd(fjz3
,tz
);
1867 /**************************
1868 * CALCULATE INTERACTIONS *
1869 **************************/
1871 r31
= _mm_mul_pd(rsq31
,rinv31
);
1873 /* EWALD ELECTROSTATICS */
1875 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1876 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1877 ewitab
= _mm_cvttpd_epi32(ewrt
);
1878 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1879 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1881 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1882 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1886 /* Calculate temporary vectorial force */
1887 tx
= _mm_mul_pd(fscal
,dx31
);
1888 ty
= _mm_mul_pd(fscal
,dy31
);
1889 tz
= _mm_mul_pd(fscal
,dz31
);
1891 /* Update vectorial force */
1892 fix3
= _mm_add_pd(fix3
,tx
);
1893 fiy3
= _mm_add_pd(fiy3
,ty
);
1894 fiz3
= _mm_add_pd(fiz3
,tz
);
1896 fjx1
= _mm_add_pd(fjx1
,tx
);
1897 fjy1
= _mm_add_pd(fjy1
,ty
);
1898 fjz1
= _mm_add_pd(fjz1
,tz
);
1900 /**************************
1901 * CALCULATE INTERACTIONS *
1902 **************************/
1904 r32
= _mm_mul_pd(rsq32
,rinv32
);
1906 /* EWALD ELECTROSTATICS */
1908 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1909 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1910 ewitab
= _mm_cvttpd_epi32(ewrt
);
1911 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1912 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1914 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1915 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1919 /* Calculate temporary vectorial force */
1920 tx
= _mm_mul_pd(fscal
,dx32
);
1921 ty
= _mm_mul_pd(fscal
,dy32
);
1922 tz
= _mm_mul_pd(fscal
,dz32
);
1924 /* Update vectorial force */
1925 fix3
= _mm_add_pd(fix3
,tx
);
1926 fiy3
= _mm_add_pd(fiy3
,ty
);
1927 fiz3
= _mm_add_pd(fiz3
,tz
);
1929 fjx2
= _mm_add_pd(fjx2
,tx
);
1930 fjy2
= _mm_add_pd(fjy2
,ty
);
1931 fjz2
= _mm_add_pd(fjz2
,tz
);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 r33
= _mm_mul_pd(rsq33
,rinv33
);
1939 /* EWALD ELECTROSTATICS */
1941 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1942 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1943 ewitab
= _mm_cvttpd_epi32(ewrt
);
1944 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1945 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1947 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1948 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1952 /* Calculate temporary vectorial force */
1953 tx
= _mm_mul_pd(fscal
,dx33
);
1954 ty
= _mm_mul_pd(fscal
,dy33
);
1955 tz
= _mm_mul_pd(fscal
,dz33
);
1957 /* Update vectorial force */
1958 fix3
= _mm_add_pd(fix3
,tx
);
1959 fiy3
= _mm_add_pd(fiy3
,ty
);
1960 fiz3
= _mm_add_pd(fiz3
,tz
);
1962 fjx3
= _mm_add_pd(fjx3
,tx
);
1963 fjy3
= _mm_add_pd(fjy3
,ty
);
1964 fjz3
= _mm_add_pd(fjz3
,tz
);
1966 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1968 /* Inner loop uses 375 flops */
1971 if(jidx
<j_index_end
)
1975 j_coord_offsetA
= DIM
*jnrA
;
1977 /* load j atom coordinates */
1978 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1979 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1980 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1982 /* Calculate displacement vector */
1983 dx00
= _mm_sub_pd(ix0
,jx0
);
1984 dy00
= _mm_sub_pd(iy0
,jy0
);
1985 dz00
= _mm_sub_pd(iz0
,jz0
);
1986 dx11
= _mm_sub_pd(ix1
,jx1
);
1987 dy11
= _mm_sub_pd(iy1
,jy1
);
1988 dz11
= _mm_sub_pd(iz1
,jz1
);
1989 dx12
= _mm_sub_pd(ix1
,jx2
);
1990 dy12
= _mm_sub_pd(iy1
,jy2
);
1991 dz12
= _mm_sub_pd(iz1
,jz2
);
1992 dx13
= _mm_sub_pd(ix1
,jx3
);
1993 dy13
= _mm_sub_pd(iy1
,jy3
);
1994 dz13
= _mm_sub_pd(iz1
,jz3
);
1995 dx21
= _mm_sub_pd(ix2
,jx1
);
1996 dy21
= _mm_sub_pd(iy2
,jy1
);
1997 dz21
= _mm_sub_pd(iz2
,jz1
);
1998 dx22
= _mm_sub_pd(ix2
,jx2
);
1999 dy22
= _mm_sub_pd(iy2
,jy2
);
2000 dz22
= _mm_sub_pd(iz2
,jz2
);
2001 dx23
= _mm_sub_pd(ix2
,jx3
);
2002 dy23
= _mm_sub_pd(iy2
,jy3
);
2003 dz23
= _mm_sub_pd(iz2
,jz3
);
2004 dx31
= _mm_sub_pd(ix3
,jx1
);
2005 dy31
= _mm_sub_pd(iy3
,jy1
);
2006 dz31
= _mm_sub_pd(iz3
,jz1
);
2007 dx32
= _mm_sub_pd(ix3
,jx2
);
2008 dy32
= _mm_sub_pd(iy3
,jy2
);
2009 dz32
= _mm_sub_pd(iz3
,jz2
);
2010 dx33
= _mm_sub_pd(ix3
,jx3
);
2011 dy33
= _mm_sub_pd(iy3
,jy3
);
2012 dz33
= _mm_sub_pd(iz3
,jz3
);
2014 /* Calculate squared distance and things based on it */
2015 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2016 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2017 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2018 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
2019 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2020 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2021 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
2022 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
2023 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
2024 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
2026 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
2027 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
2028 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
2029 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
2030 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
2031 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
2032 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
2033 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
2034 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
2035 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
2037 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2038 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2039 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
2040 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2041 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2042 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
2043 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
2044 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
2045 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
2047 fjx0
= _mm_setzero_pd();
2048 fjy0
= _mm_setzero_pd();
2049 fjz0
= _mm_setzero_pd();
2050 fjx1
= _mm_setzero_pd();
2051 fjy1
= _mm_setzero_pd();
2052 fjz1
= _mm_setzero_pd();
2053 fjx2
= _mm_setzero_pd();
2054 fjy2
= _mm_setzero_pd();
2055 fjz2
= _mm_setzero_pd();
2056 fjx3
= _mm_setzero_pd();
2057 fjy3
= _mm_setzero_pd();
2058 fjz3
= _mm_setzero_pd();
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r00
= _mm_mul_pd(rsq00
,rinv00
);
2066 /* Calculate table index by multiplying r with table scale and truncate to integer */
2067 rt
= _mm_mul_pd(r00
,vftabscale
);
2068 vfitab
= _mm_cvttpd_epi32(rt
);
2069 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2070 vfitab
= _mm_slli_epi32(vfitab
,3);
2072 /* CUBIC SPLINE TABLE DISPERSION */
2073 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2074 F
= _mm_setzero_pd();
2075 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2076 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2077 H
= _mm_setzero_pd();
2078 GMX_MM_TRANSPOSE2_PD(G
,H
);
2079 Heps
= _mm_mul_pd(vfeps
,H
);
2080 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2081 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2082 fvdw6
= _mm_mul_pd(c6_00
,FF
);
2084 /* CUBIC SPLINE TABLE REPULSION */
2085 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2086 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2087 F
= _mm_setzero_pd();
2088 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2089 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2090 H
= _mm_setzero_pd();
2091 GMX_MM_TRANSPOSE2_PD(G
,H
);
2092 Heps
= _mm_mul_pd(vfeps
,H
);
2093 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2094 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2095 fvdw12
= _mm_mul_pd(c12_00
,FF
);
2096 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
2100 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2102 /* Calculate temporary vectorial force */
2103 tx
= _mm_mul_pd(fscal
,dx00
);
2104 ty
= _mm_mul_pd(fscal
,dy00
);
2105 tz
= _mm_mul_pd(fscal
,dz00
);
2107 /* Update vectorial force */
2108 fix0
= _mm_add_pd(fix0
,tx
);
2109 fiy0
= _mm_add_pd(fiy0
,ty
);
2110 fiz0
= _mm_add_pd(fiz0
,tz
);
2112 fjx0
= _mm_add_pd(fjx0
,tx
);
2113 fjy0
= _mm_add_pd(fjy0
,ty
);
2114 fjz0
= _mm_add_pd(fjz0
,tz
);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 r11
= _mm_mul_pd(rsq11
,rinv11
);
2122 /* EWALD ELECTROSTATICS */
2124 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2125 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2126 ewitab
= _mm_cvttpd_epi32(ewrt
);
2127 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2128 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2129 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2130 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2134 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2136 /* Calculate temporary vectorial force */
2137 tx
= _mm_mul_pd(fscal
,dx11
);
2138 ty
= _mm_mul_pd(fscal
,dy11
);
2139 tz
= _mm_mul_pd(fscal
,dz11
);
2141 /* Update vectorial force */
2142 fix1
= _mm_add_pd(fix1
,tx
);
2143 fiy1
= _mm_add_pd(fiy1
,ty
);
2144 fiz1
= _mm_add_pd(fiz1
,tz
);
2146 fjx1
= _mm_add_pd(fjx1
,tx
);
2147 fjy1
= _mm_add_pd(fjy1
,ty
);
2148 fjz1
= _mm_add_pd(fjz1
,tz
);
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2154 r12
= _mm_mul_pd(rsq12
,rinv12
);
2156 /* EWALD ELECTROSTATICS */
2158 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2159 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2160 ewitab
= _mm_cvttpd_epi32(ewrt
);
2161 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2162 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2163 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2164 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2168 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2170 /* Calculate temporary vectorial force */
2171 tx
= _mm_mul_pd(fscal
,dx12
);
2172 ty
= _mm_mul_pd(fscal
,dy12
);
2173 tz
= _mm_mul_pd(fscal
,dz12
);
2175 /* Update vectorial force */
2176 fix1
= _mm_add_pd(fix1
,tx
);
2177 fiy1
= _mm_add_pd(fiy1
,ty
);
2178 fiz1
= _mm_add_pd(fiz1
,tz
);
2180 fjx2
= _mm_add_pd(fjx2
,tx
);
2181 fjy2
= _mm_add_pd(fjy2
,ty
);
2182 fjz2
= _mm_add_pd(fjz2
,tz
);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 r13
= _mm_mul_pd(rsq13
,rinv13
);
2190 /* EWALD ELECTROSTATICS */
2192 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2193 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2194 ewitab
= _mm_cvttpd_epi32(ewrt
);
2195 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2196 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2197 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2198 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2202 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2204 /* Calculate temporary vectorial force */
2205 tx
= _mm_mul_pd(fscal
,dx13
);
2206 ty
= _mm_mul_pd(fscal
,dy13
);
2207 tz
= _mm_mul_pd(fscal
,dz13
);
2209 /* Update vectorial force */
2210 fix1
= _mm_add_pd(fix1
,tx
);
2211 fiy1
= _mm_add_pd(fiy1
,ty
);
2212 fiz1
= _mm_add_pd(fiz1
,tz
);
2214 fjx3
= _mm_add_pd(fjx3
,tx
);
2215 fjy3
= _mm_add_pd(fjy3
,ty
);
2216 fjz3
= _mm_add_pd(fjz3
,tz
);
2218 /**************************
2219 * CALCULATE INTERACTIONS *
2220 **************************/
2222 r21
= _mm_mul_pd(rsq21
,rinv21
);
2224 /* EWALD ELECTROSTATICS */
2226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2227 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2228 ewitab
= _mm_cvttpd_epi32(ewrt
);
2229 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2230 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2231 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2232 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2236 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2238 /* Calculate temporary vectorial force */
2239 tx
= _mm_mul_pd(fscal
,dx21
);
2240 ty
= _mm_mul_pd(fscal
,dy21
);
2241 tz
= _mm_mul_pd(fscal
,dz21
);
2243 /* Update vectorial force */
2244 fix2
= _mm_add_pd(fix2
,tx
);
2245 fiy2
= _mm_add_pd(fiy2
,ty
);
2246 fiz2
= _mm_add_pd(fiz2
,tz
);
2248 fjx1
= _mm_add_pd(fjx1
,tx
);
2249 fjy1
= _mm_add_pd(fjy1
,ty
);
2250 fjz1
= _mm_add_pd(fjz1
,tz
);
2252 /**************************
2253 * CALCULATE INTERACTIONS *
2254 **************************/
2256 r22
= _mm_mul_pd(rsq22
,rinv22
);
2258 /* EWALD ELECTROSTATICS */
2260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2261 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2262 ewitab
= _mm_cvttpd_epi32(ewrt
);
2263 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2264 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2265 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2266 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2270 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2272 /* Calculate temporary vectorial force */
2273 tx
= _mm_mul_pd(fscal
,dx22
);
2274 ty
= _mm_mul_pd(fscal
,dy22
);
2275 tz
= _mm_mul_pd(fscal
,dz22
);
2277 /* Update vectorial force */
2278 fix2
= _mm_add_pd(fix2
,tx
);
2279 fiy2
= _mm_add_pd(fiy2
,ty
);
2280 fiz2
= _mm_add_pd(fiz2
,tz
);
2282 fjx2
= _mm_add_pd(fjx2
,tx
);
2283 fjy2
= _mm_add_pd(fjy2
,ty
);
2284 fjz2
= _mm_add_pd(fjz2
,tz
);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r23
= _mm_mul_pd(rsq23
,rinv23
);
2292 /* EWALD ELECTROSTATICS */
2294 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2295 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2296 ewitab
= _mm_cvttpd_epi32(ewrt
);
2297 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2298 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2299 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2300 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2304 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2306 /* Calculate temporary vectorial force */
2307 tx
= _mm_mul_pd(fscal
,dx23
);
2308 ty
= _mm_mul_pd(fscal
,dy23
);
2309 tz
= _mm_mul_pd(fscal
,dz23
);
2311 /* Update vectorial force */
2312 fix2
= _mm_add_pd(fix2
,tx
);
2313 fiy2
= _mm_add_pd(fiy2
,ty
);
2314 fiz2
= _mm_add_pd(fiz2
,tz
);
2316 fjx3
= _mm_add_pd(fjx3
,tx
);
2317 fjy3
= _mm_add_pd(fjy3
,ty
);
2318 fjz3
= _mm_add_pd(fjz3
,tz
);
2320 /**************************
2321 * CALCULATE INTERACTIONS *
2322 **************************/
2324 r31
= _mm_mul_pd(rsq31
,rinv31
);
2326 /* EWALD ELECTROSTATICS */
2328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2329 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2330 ewitab
= _mm_cvttpd_epi32(ewrt
);
2331 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2332 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2333 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2334 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2338 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2340 /* Calculate temporary vectorial force */
2341 tx
= _mm_mul_pd(fscal
,dx31
);
2342 ty
= _mm_mul_pd(fscal
,dy31
);
2343 tz
= _mm_mul_pd(fscal
,dz31
);
2345 /* Update vectorial force */
2346 fix3
= _mm_add_pd(fix3
,tx
);
2347 fiy3
= _mm_add_pd(fiy3
,ty
);
2348 fiz3
= _mm_add_pd(fiz3
,tz
);
2350 fjx1
= _mm_add_pd(fjx1
,tx
);
2351 fjy1
= _mm_add_pd(fjy1
,ty
);
2352 fjz1
= _mm_add_pd(fjz1
,tz
);
2354 /**************************
2355 * CALCULATE INTERACTIONS *
2356 **************************/
2358 r32
= _mm_mul_pd(rsq32
,rinv32
);
2360 /* EWALD ELECTROSTATICS */
2362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2364 ewitab
= _mm_cvttpd_epi32(ewrt
);
2365 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2366 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2367 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2368 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2372 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2374 /* Calculate temporary vectorial force */
2375 tx
= _mm_mul_pd(fscal
,dx32
);
2376 ty
= _mm_mul_pd(fscal
,dy32
);
2377 tz
= _mm_mul_pd(fscal
,dz32
);
2379 /* Update vectorial force */
2380 fix3
= _mm_add_pd(fix3
,tx
);
2381 fiy3
= _mm_add_pd(fiy3
,ty
);
2382 fiz3
= _mm_add_pd(fiz3
,tz
);
2384 fjx2
= _mm_add_pd(fjx2
,tx
);
2385 fjy2
= _mm_add_pd(fjy2
,ty
);
2386 fjz2
= _mm_add_pd(fjz2
,tz
);
2388 /**************************
2389 * CALCULATE INTERACTIONS *
2390 **************************/
2392 r33
= _mm_mul_pd(rsq33
,rinv33
);
2394 /* EWALD ELECTROSTATICS */
2396 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2397 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2398 ewitab
= _mm_cvttpd_epi32(ewrt
);
2399 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2400 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2401 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2402 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2406 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2408 /* Calculate temporary vectorial force */
2409 tx
= _mm_mul_pd(fscal
,dx33
);
2410 ty
= _mm_mul_pd(fscal
,dy33
);
2411 tz
= _mm_mul_pd(fscal
,dz33
);
2413 /* Update vectorial force */
2414 fix3
= _mm_add_pd(fix3
,tx
);
2415 fiy3
= _mm_add_pd(fiy3
,ty
);
2416 fiz3
= _mm_add_pd(fiz3
,tz
);
2418 fjx3
= _mm_add_pd(fjx3
,tx
);
2419 fjy3
= _mm_add_pd(fjy3
,ty
);
2420 fjz3
= _mm_add_pd(fjz3
,tz
);
2422 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2424 /* Inner loop uses 375 flops */
2427 /* End of innermost loop */
2429 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2430 f
+i_coord_offset
,fshift
+i_shift_offset
);
2432 /* Increment number of inner iterations */
2433 inneriter
+= j_index_end
- j_index_start
;
2435 /* Outer loop uses 24 flops */
2438 /* Increment number of outer iterations */
2441 /* Update outer/inner flops */
2443 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*375);