2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse4_1_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int j_coord_offsetA
,j_coord_offsetB
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
81 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
88 int vdwjidx0A
,vdwjidx0B
;
89 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 int vdwjidx1A
,vdwjidx1B
;
91 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
92 int vdwjidx2A
,vdwjidx2B
;
93 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
94 int vdwjidx3A
,vdwjidx3B
;
95 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
96 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
98 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
99 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
100 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
101 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
102 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
103 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
104 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
105 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
106 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
109 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
112 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
115 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
117 __m128d dummy_mask
,cutoff_mask
;
118 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119 __m128d one
= _mm_set1_pd(1.0);
120 __m128d two
= _mm_set1_pd(2.0);
126 jindex
= nlist
->jindex
;
128 shiftidx
= nlist
->shift
;
130 shiftvec
= fr
->shift_vec
[0];
131 fshift
= fr
->fshift
[0];
132 facel
= _mm_set1_pd(fr
->epsfac
);
133 charge
= mdatoms
->chargeA
;
134 nvdwtype
= fr
->ntype
;
136 vdwtype
= mdatoms
->typeA
;
138 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
139 ewtab
= fr
->ic
->tabq_coul_FDV0
;
140 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
141 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
143 /* Setup water-specific parameters */
144 inr
= nlist
->iinr
[0];
145 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
146 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
147 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
148 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
150 jq1
= _mm_set1_pd(charge
[inr
+1]);
151 jq2
= _mm_set1_pd(charge
[inr
+2]);
152 jq3
= _mm_set1_pd(charge
[inr
+3]);
153 vdwjidx0A
= 2*vdwtype
[inr
+0];
154 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
155 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
156 qq11
= _mm_mul_pd(iq1
,jq1
);
157 qq12
= _mm_mul_pd(iq1
,jq2
);
158 qq13
= _mm_mul_pd(iq1
,jq3
);
159 qq21
= _mm_mul_pd(iq2
,jq1
);
160 qq22
= _mm_mul_pd(iq2
,jq2
);
161 qq23
= _mm_mul_pd(iq2
,jq3
);
162 qq31
= _mm_mul_pd(iq3
,jq1
);
163 qq32
= _mm_mul_pd(iq3
,jq2
);
164 qq33
= _mm_mul_pd(iq3
,jq3
);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar
= fr
->rcoulomb
;
168 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
169 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
171 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
172 rvdw
= _mm_set1_pd(fr
->rvdw
);
174 /* Avoid stupid compiler warnings */
182 /* Start outer loop over neighborlists */
183 for(iidx
=0; iidx
<nri
; iidx
++)
185 /* Load shift vector for this list */
186 i_shift_offset
= DIM
*shiftidx
[iidx
];
188 /* Load limits for loop over neighbors */
189 j_index_start
= jindex
[iidx
];
190 j_index_end
= jindex
[iidx
+1];
192 /* Get outer coordinate index */
194 i_coord_offset
= DIM
*inr
;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
198 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
200 fix0
= _mm_setzero_pd();
201 fiy0
= _mm_setzero_pd();
202 fiz0
= _mm_setzero_pd();
203 fix1
= _mm_setzero_pd();
204 fiy1
= _mm_setzero_pd();
205 fiz1
= _mm_setzero_pd();
206 fix2
= _mm_setzero_pd();
207 fiy2
= _mm_setzero_pd();
208 fiz2
= _mm_setzero_pd();
209 fix3
= _mm_setzero_pd();
210 fiy3
= _mm_setzero_pd();
211 fiz3
= _mm_setzero_pd();
213 /* Reset potential sums */
214 velecsum
= _mm_setzero_pd();
215 vvdwsum
= _mm_setzero_pd();
217 /* Start inner kernel loop */
218 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
221 /* Get j neighbor index, and coordinate index */
224 j_coord_offsetA
= DIM
*jnrA
;
225 j_coord_offsetB
= DIM
*jnrB
;
227 /* load j atom coordinates */
228 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
229 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
230 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
232 /* Calculate displacement vector */
233 dx00
= _mm_sub_pd(ix0
,jx0
);
234 dy00
= _mm_sub_pd(iy0
,jy0
);
235 dz00
= _mm_sub_pd(iz0
,jz0
);
236 dx11
= _mm_sub_pd(ix1
,jx1
);
237 dy11
= _mm_sub_pd(iy1
,jy1
);
238 dz11
= _mm_sub_pd(iz1
,jz1
);
239 dx12
= _mm_sub_pd(ix1
,jx2
);
240 dy12
= _mm_sub_pd(iy1
,jy2
);
241 dz12
= _mm_sub_pd(iz1
,jz2
);
242 dx13
= _mm_sub_pd(ix1
,jx3
);
243 dy13
= _mm_sub_pd(iy1
,jy3
);
244 dz13
= _mm_sub_pd(iz1
,jz3
);
245 dx21
= _mm_sub_pd(ix2
,jx1
);
246 dy21
= _mm_sub_pd(iy2
,jy1
);
247 dz21
= _mm_sub_pd(iz2
,jz1
);
248 dx22
= _mm_sub_pd(ix2
,jx2
);
249 dy22
= _mm_sub_pd(iy2
,jy2
);
250 dz22
= _mm_sub_pd(iz2
,jz2
);
251 dx23
= _mm_sub_pd(ix2
,jx3
);
252 dy23
= _mm_sub_pd(iy2
,jy3
);
253 dz23
= _mm_sub_pd(iz2
,jz3
);
254 dx31
= _mm_sub_pd(ix3
,jx1
);
255 dy31
= _mm_sub_pd(iy3
,jy1
);
256 dz31
= _mm_sub_pd(iz3
,jz1
);
257 dx32
= _mm_sub_pd(ix3
,jx2
);
258 dy32
= _mm_sub_pd(iy3
,jy2
);
259 dz32
= _mm_sub_pd(iz3
,jz2
);
260 dx33
= _mm_sub_pd(ix3
,jx3
);
261 dy33
= _mm_sub_pd(iy3
,jy3
);
262 dz33
= _mm_sub_pd(iz3
,jz3
);
264 /* Calculate squared distance and things based on it */
265 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
266 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
267 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
268 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
269 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
270 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
271 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
272 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
273 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
274 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
276 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
277 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
278 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
279 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
280 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
281 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
282 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
283 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
284 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
286 rinvsq00
= gmx_mm_inv_pd(rsq00
);
287 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
288 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
289 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
290 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
291 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
292 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
293 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
294 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
295 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
297 fjx0
= _mm_setzero_pd();
298 fjy0
= _mm_setzero_pd();
299 fjz0
= _mm_setzero_pd();
300 fjx1
= _mm_setzero_pd();
301 fjy1
= _mm_setzero_pd();
302 fjz1
= _mm_setzero_pd();
303 fjx2
= _mm_setzero_pd();
304 fjy2
= _mm_setzero_pd();
305 fjz2
= _mm_setzero_pd();
306 fjx3
= _mm_setzero_pd();
307 fjy3
= _mm_setzero_pd();
308 fjz3
= _mm_setzero_pd();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
320 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
321 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
322 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
323 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
324 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
326 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
330 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
334 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
336 /* Calculate temporary vectorial force */
337 tx
= _mm_mul_pd(fscal
,dx00
);
338 ty
= _mm_mul_pd(fscal
,dy00
);
339 tz
= _mm_mul_pd(fscal
,dz00
);
341 /* Update vectorial force */
342 fix0
= _mm_add_pd(fix0
,tx
);
343 fiy0
= _mm_add_pd(fiy0
,ty
);
344 fiz0
= _mm_add_pd(fiz0
,tz
);
346 fjx0
= _mm_add_pd(fjx0
,tx
);
347 fjy0
= _mm_add_pd(fjy0
,ty
);
348 fjz0
= _mm_add_pd(fjz0
,tz
);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
359 r11
= _mm_mul_pd(rsq11
,rinv11
);
361 /* EWALD ELECTROSTATICS */
363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
364 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
365 ewitab
= _mm_cvttpd_epi32(ewrt
);
366 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
367 ewitab
= _mm_slli_epi32(ewitab
,2);
368 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
369 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
370 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
371 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
372 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
373 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
374 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
375 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
376 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
377 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
379 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velec
= _mm_and_pd(velec
,cutoff_mask
);
383 velecsum
= _mm_add_pd(velecsum
,velec
);
387 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
389 /* Calculate temporary vectorial force */
390 tx
= _mm_mul_pd(fscal
,dx11
);
391 ty
= _mm_mul_pd(fscal
,dy11
);
392 tz
= _mm_mul_pd(fscal
,dz11
);
394 /* Update vectorial force */
395 fix1
= _mm_add_pd(fix1
,tx
);
396 fiy1
= _mm_add_pd(fiy1
,ty
);
397 fiz1
= _mm_add_pd(fiz1
,tz
);
399 fjx1
= _mm_add_pd(fjx1
,tx
);
400 fjy1
= _mm_add_pd(fjy1
,ty
);
401 fjz1
= _mm_add_pd(fjz1
,tz
);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
412 r12
= _mm_mul_pd(rsq12
,rinv12
);
414 /* EWALD ELECTROSTATICS */
416 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
417 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
418 ewitab
= _mm_cvttpd_epi32(ewrt
);
419 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
420 ewitab
= _mm_slli_epi32(ewitab
,2);
421 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
422 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
423 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
424 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
425 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
426 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
427 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
428 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
429 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
430 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
432 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velec
= _mm_and_pd(velec
,cutoff_mask
);
436 velecsum
= _mm_add_pd(velecsum
,velec
);
440 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
442 /* Calculate temporary vectorial force */
443 tx
= _mm_mul_pd(fscal
,dx12
);
444 ty
= _mm_mul_pd(fscal
,dy12
);
445 tz
= _mm_mul_pd(fscal
,dz12
);
447 /* Update vectorial force */
448 fix1
= _mm_add_pd(fix1
,tx
);
449 fiy1
= _mm_add_pd(fiy1
,ty
);
450 fiz1
= _mm_add_pd(fiz1
,tz
);
452 fjx2
= _mm_add_pd(fjx2
,tx
);
453 fjy2
= _mm_add_pd(fjy2
,ty
);
454 fjz2
= _mm_add_pd(fjz2
,tz
);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
465 r13
= _mm_mul_pd(rsq13
,rinv13
);
467 /* EWALD ELECTROSTATICS */
469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
471 ewitab
= _mm_cvttpd_epi32(ewrt
);
472 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
473 ewitab
= _mm_slli_epi32(ewitab
,2);
474 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
475 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
476 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
477 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
478 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
479 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
480 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
481 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
482 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_sub_pd(rinv13
,sh_ewald
),velec
));
483 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
485 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec
= _mm_and_pd(velec
,cutoff_mask
);
489 velecsum
= _mm_add_pd(velecsum
,velec
);
493 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
495 /* Calculate temporary vectorial force */
496 tx
= _mm_mul_pd(fscal
,dx13
);
497 ty
= _mm_mul_pd(fscal
,dy13
);
498 tz
= _mm_mul_pd(fscal
,dz13
);
500 /* Update vectorial force */
501 fix1
= _mm_add_pd(fix1
,tx
);
502 fiy1
= _mm_add_pd(fiy1
,ty
);
503 fiz1
= _mm_add_pd(fiz1
,tz
);
505 fjx3
= _mm_add_pd(fjx3
,tx
);
506 fjy3
= _mm_add_pd(fjy3
,ty
);
507 fjz3
= _mm_add_pd(fjz3
,tz
);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
518 r21
= _mm_mul_pd(rsq21
,rinv21
);
520 /* EWALD ELECTROSTATICS */
522 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
523 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
524 ewitab
= _mm_cvttpd_epi32(ewrt
);
525 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
526 ewitab
= _mm_slli_epi32(ewitab
,2);
527 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
528 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
529 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
530 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
531 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
532 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
533 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
534 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
535 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
536 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
538 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velec
= _mm_and_pd(velec
,cutoff_mask
);
542 velecsum
= _mm_add_pd(velecsum
,velec
);
546 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
548 /* Calculate temporary vectorial force */
549 tx
= _mm_mul_pd(fscal
,dx21
);
550 ty
= _mm_mul_pd(fscal
,dy21
);
551 tz
= _mm_mul_pd(fscal
,dz21
);
553 /* Update vectorial force */
554 fix2
= _mm_add_pd(fix2
,tx
);
555 fiy2
= _mm_add_pd(fiy2
,ty
);
556 fiz2
= _mm_add_pd(fiz2
,tz
);
558 fjx1
= _mm_add_pd(fjx1
,tx
);
559 fjy1
= _mm_add_pd(fjy1
,ty
);
560 fjz1
= _mm_add_pd(fjz1
,tz
);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
571 r22
= _mm_mul_pd(rsq22
,rinv22
);
573 /* EWALD ELECTROSTATICS */
575 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
576 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
577 ewitab
= _mm_cvttpd_epi32(ewrt
);
578 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
579 ewitab
= _mm_slli_epi32(ewitab
,2);
580 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
581 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
582 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
583 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
584 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
585 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
586 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
587 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
588 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
589 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
591 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
593 /* Update potential sum for this i atom from the interaction with this j atom. */
594 velec
= _mm_and_pd(velec
,cutoff_mask
);
595 velecsum
= _mm_add_pd(velecsum
,velec
);
599 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
601 /* Calculate temporary vectorial force */
602 tx
= _mm_mul_pd(fscal
,dx22
);
603 ty
= _mm_mul_pd(fscal
,dy22
);
604 tz
= _mm_mul_pd(fscal
,dz22
);
606 /* Update vectorial force */
607 fix2
= _mm_add_pd(fix2
,tx
);
608 fiy2
= _mm_add_pd(fiy2
,ty
);
609 fiz2
= _mm_add_pd(fiz2
,tz
);
611 fjx2
= _mm_add_pd(fjx2
,tx
);
612 fjy2
= _mm_add_pd(fjy2
,ty
);
613 fjz2
= _mm_add_pd(fjz2
,tz
);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
624 r23
= _mm_mul_pd(rsq23
,rinv23
);
626 /* EWALD ELECTROSTATICS */
628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
630 ewitab
= _mm_cvttpd_epi32(ewrt
);
631 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
632 ewitab
= _mm_slli_epi32(ewitab
,2);
633 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
634 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
635 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
636 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
637 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
638 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
639 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
640 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
641 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_sub_pd(rinv23
,sh_ewald
),velec
));
642 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
644 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
646 /* Update potential sum for this i atom from the interaction with this j atom. */
647 velec
= _mm_and_pd(velec
,cutoff_mask
);
648 velecsum
= _mm_add_pd(velecsum
,velec
);
652 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
654 /* Calculate temporary vectorial force */
655 tx
= _mm_mul_pd(fscal
,dx23
);
656 ty
= _mm_mul_pd(fscal
,dy23
);
657 tz
= _mm_mul_pd(fscal
,dz23
);
659 /* Update vectorial force */
660 fix2
= _mm_add_pd(fix2
,tx
);
661 fiy2
= _mm_add_pd(fiy2
,ty
);
662 fiz2
= _mm_add_pd(fiz2
,tz
);
664 fjx3
= _mm_add_pd(fjx3
,tx
);
665 fjy3
= _mm_add_pd(fjy3
,ty
);
666 fjz3
= _mm_add_pd(fjz3
,tz
);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
677 r31
= _mm_mul_pd(rsq31
,rinv31
);
679 /* EWALD ELECTROSTATICS */
681 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
682 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
683 ewitab
= _mm_cvttpd_epi32(ewrt
);
684 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
685 ewitab
= _mm_slli_epi32(ewitab
,2);
686 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
687 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
688 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
689 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
690 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
691 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
692 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
693 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
694 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_sub_pd(rinv31
,sh_ewald
),velec
));
695 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
697 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velec
= _mm_and_pd(velec
,cutoff_mask
);
701 velecsum
= _mm_add_pd(velecsum
,velec
);
705 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
707 /* Calculate temporary vectorial force */
708 tx
= _mm_mul_pd(fscal
,dx31
);
709 ty
= _mm_mul_pd(fscal
,dy31
);
710 tz
= _mm_mul_pd(fscal
,dz31
);
712 /* Update vectorial force */
713 fix3
= _mm_add_pd(fix3
,tx
);
714 fiy3
= _mm_add_pd(fiy3
,ty
);
715 fiz3
= _mm_add_pd(fiz3
,tz
);
717 fjx1
= _mm_add_pd(fjx1
,tx
);
718 fjy1
= _mm_add_pd(fjy1
,ty
);
719 fjz1
= _mm_add_pd(fjz1
,tz
);
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
730 r32
= _mm_mul_pd(rsq32
,rinv32
);
732 /* EWALD ELECTROSTATICS */
734 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
735 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
736 ewitab
= _mm_cvttpd_epi32(ewrt
);
737 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
738 ewitab
= _mm_slli_epi32(ewitab
,2);
739 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
740 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
741 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
742 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
743 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
744 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
745 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
746 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
747 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_sub_pd(rinv32
,sh_ewald
),velec
));
748 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
750 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec
= _mm_and_pd(velec
,cutoff_mask
);
754 velecsum
= _mm_add_pd(velecsum
,velec
);
758 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
760 /* Calculate temporary vectorial force */
761 tx
= _mm_mul_pd(fscal
,dx32
);
762 ty
= _mm_mul_pd(fscal
,dy32
);
763 tz
= _mm_mul_pd(fscal
,dz32
);
765 /* Update vectorial force */
766 fix3
= _mm_add_pd(fix3
,tx
);
767 fiy3
= _mm_add_pd(fiy3
,ty
);
768 fiz3
= _mm_add_pd(fiz3
,tz
);
770 fjx2
= _mm_add_pd(fjx2
,tx
);
771 fjy2
= _mm_add_pd(fjy2
,ty
);
772 fjz2
= _mm_add_pd(fjz2
,tz
);
776 /**************************
777 * CALCULATE INTERACTIONS *
778 **************************/
780 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
783 r33
= _mm_mul_pd(rsq33
,rinv33
);
785 /* EWALD ELECTROSTATICS */
787 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
788 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
789 ewitab
= _mm_cvttpd_epi32(ewrt
);
790 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
791 ewitab
= _mm_slli_epi32(ewitab
,2);
792 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
793 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
794 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
795 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
796 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
797 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
798 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
799 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
800 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_sub_pd(rinv33
,sh_ewald
),velec
));
801 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
803 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
805 /* Update potential sum for this i atom from the interaction with this j atom. */
806 velec
= _mm_and_pd(velec
,cutoff_mask
);
807 velecsum
= _mm_add_pd(velecsum
,velec
);
811 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
813 /* Calculate temporary vectorial force */
814 tx
= _mm_mul_pd(fscal
,dx33
);
815 ty
= _mm_mul_pd(fscal
,dy33
);
816 tz
= _mm_mul_pd(fscal
,dz33
);
818 /* Update vectorial force */
819 fix3
= _mm_add_pd(fix3
,tx
);
820 fiy3
= _mm_add_pd(fiy3
,ty
);
821 fiz3
= _mm_add_pd(fiz3
,tz
);
823 fjx3
= _mm_add_pd(fjx3
,tx
);
824 fjy3
= _mm_add_pd(fjy3
,ty
);
825 fjz3
= _mm_add_pd(fjz3
,tz
);
829 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
831 /* Inner loop uses 458 flops */
838 j_coord_offsetA
= DIM
*jnrA
;
840 /* load j atom coordinates */
841 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
842 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
843 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
845 /* Calculate displacement vector */
846 dx00
= _mm_sub_pd(ix0
,jx0
);
847 dy00
= _mm_sub_pd(iy0
,jy0
);
848 dz00
= _mm_sub_pd(iz0
,jz0
);
849 dx11
= _mm_sub_pd(ix1
,jx1
);
850 dy11
= _mm_sub_pd(iy1
,jy1
);
851 dz11
= _mm_sub_pd(iz1
,jz1
);
852 dx12
= _mm_sub_pd(ix1
,jx2
);
853 dy12
= _mm_sub_pd(iy1
,jy2
);
854 dz12
= _mm_sub_pd(iz1
,jz2
);
855 dx13
= _mm_sub_pd(ix1
,jx3
);
856 dy13
= _mm_sub_pd(iy1
,jy3
);
857 dz13
= _mm_sub_pd(iz1
,jz3
);
858 dx21
= _mm_sub_pd(ix2
,jx1
);
859 dy21
= _mm_sub_pd(iy2
,jy1
);
860 dz21
= _mm_sub_pd(iz2
,jz1
);
861 dx22
= _mm_sub_pd(ix2
,jx2
);
862 dy22
= _mm_sub_pd(iy2
,jy2
);
863 dz22
= _mm_sub_pd(iz2
,jz2
);
864 dx23
= _mm_sub_pd(ix2
,jx3
);
865 dy23
= _mm_sub_pd(iy2
,jy3
);
866 dz23
= _mm_sub_pd(iz2
,jz3
);
867 dx31
= _mm_sub_pd(ix3
,jx1
);
868 dy31
= _mm_sub_pd(iy3
,jy1
);
869 dz31
= _mm_sub_pd(iz3
,jz1
);
870 dx32
= _mm_sub_pd(ix3
,jx2
);
871 dy32
= _mm_sub_pd(iy3
,jy2
);
872 dz32
= _mm_sub_pd(iz3
,jz2
);
873 dx33
= _mm_sub_pd(ix3
,jx3
);
874 dy33
= _mm_sub_pd(iy3
,jy3
);
875 dz33
= _mm_sub_pd(iz3
,jz3
);
877 /* Calculate squared distance and things based on it */
878 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
879 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
880 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
881 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
882 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
883 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
884 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
885 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
886 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
887 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
889 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
890 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
891 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
892 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
893 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
894 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
895 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
896 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
897 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
899 rinvsq00
= gmx_mm_inv_pd(rsq00
);
900 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
901 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
902 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
903 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
904 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
905 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
906 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
907 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
908 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
910 fjx0
= _mm_setzero_pd();
911 fjy0
= _mm_setzero_pd();
912 fjz0
= _mm_setzero_pd();
913 fjx1
= _mm_setzero_pd();
914 fjy1
= _mm_setzero_pd();
915 fjz1
= _mm_setzero_pd();
916 fjx2
= _mm_setzero_pd();
917 fjy2
= _mm_setzero_pd();
918 fjz2
= _mm_setzero_pd();
919 fjx3
= _mm_setzero_pd();
920 fjy3
= _mm_setzero_pd();
921 fjz3
= _mm_setzero_pd();
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
930 /* LENNARD-JONES DISPERSION/REPULSION */
932 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
933 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
934 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
935 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
936 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
937 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
939 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
941 /* Update potential sum for this i atom from the interaction with this j atom. */
942 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
943 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
944 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
948 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
950 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
952 /* Calculate temporary vectorial force */
953 tx
= _mm_mul_pd(fscal
,dx00
);
954 ty
= _mm_mul_pd(fscal
,dy00
);
955 tz
= _mm_mul_pd(fscal
,dz00
);
957 /* Update vectorial force */
958 fix0
= _mm_add_pd(fix0
,tx
);
959 fiy0
= _mm_add_pd(fiy0
,ty
);
960 fiz0
= _mm_add_pd(fiz0
,tz
);
962 fjx0
= _mm_add_pd(fjx0
,tx
);
963 fjy0
= _mm_add_pd(fjy0
,ty
);
964 fjz0
= _mm_add_pd(fjz0
,tz
);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
975 r11
= _mm_mul_pd(rsq11
,rinv11
);
977 /* EWALD ELECTROSTATICS */
979 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
980 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
981 ewitab
= _mm_cvttpd_epi32(ewrt
);
982 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
983 ewitab
= _mm_slli_epi32(ewitab
,2);
984 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
985 ewtabD
= _mm_setzero_pd();
986 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
987 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
988 ewtabFn
= _mm_setzero_pd();
989 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
990 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
991 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
992 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
993 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
995 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
997 /* Update potential sum for this i atom from the interaction with this j atom. */
998 velec
= _mm_and_pd(velec
,cutoff_mask
);
999 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1000 velecsum
= _mm_add_pd(velecsum
,velec
);
1004 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1006 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1008 /* Calculate temporary vectorial force */
1009 tx
= _mm_mul_pd(fscal
,dx11
);
1010 ty
= _mm_mul_pd(fscal
,dy11
);
1011 tz
= _mm_mul_pd(fscal
,dz11
);
1013 /* Update vectorial force */
1014 fix1
= _mm_add_pd(fix1
,tx
);
1015 fiy1
= _mm_add_pd(fiy1
,ty
);
1016 fiz1
= _mm_add_pd(fiz1
,tz
);
1018 fjx1
= _mm_add_pd(fjx1
,tx
);
1019 fjy1
= _mm_add_pd(fjy1
,ty
);
1020 fjz1
= _mm_add_pd(fjz1
,tz
);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1031 r12
= _mm_mul_pd(rsq12
,rinv12
);
1033 /* EWALD ELECTROSTATICS */
1035 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1036 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1037 ewitab
= _mm_cvttpd_epi32(ewrt
);
1038 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1039 ewitab
= _mm_slli_epi32(ewitab
,2);
1040 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1041 ewtabD
= _mm_setzero_pd();
1042 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1043 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1044 ewtabFn
= _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1046 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1047 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1048 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
1049 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1051 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1053 /* Update potential sum for this i atom from the interaction with this j atom. */
1054 velec
= _mm_and_pd(velec
,cutoff_mask
);
1055 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1056 velecsum
= _mm_add_pd(velecsum
,velec
);
1060 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1062 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1064 /* Calculate temporary vectorial force */
1065 tx
= _mm_mul_pd(fscal
,dx12
);
1066 ty
= _mm_mul_pd(fscal
,dy12
);
1067 tz
= _mm_mul_pd(fscal
,dz12
);
1069 /* Update vectorial force */
1070 fix1
= _mm_add_pd(fix1
,tx
);
1071 fiy1
= _mm_add_pd(fiy1
,ty
);
1072 fiz1
= _mm_add_pd(fiz1
,tz
);
1074 fjx2
= _mm_add_pd(fjx2
,tx
);
1075 fjy2
= _mm_add_pd(fjy2
,ty
);
1076 fjz2
= _mm_add_pd(fjz2
,tz
);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1087 r13
= _mm_mul_pd(rsq13
,rinv13
);
1089 /* EWALD ELECTROSTATICS */
1091 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1092 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1093 ewitab
= _mm_cvttpd_epi32(ewrt
);
1094 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1095 ewitab
= _mm_slli_epi32(ewitab
,2);
1096 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1097 ewtabD
= _mm_setzero_pd();
1098 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1099 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1100 ewtabFn
= _mm_setzero_pd();
1101 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1102 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1103 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1104 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_sub_pd(rinv13
,sh_ewald
),velec
));
1105 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1107 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1109 /* Update potential sum for this i atom from the interaction with this j atom. */
1110 velec
= _mm_and_pd(velec
,cutoff_mask
);
1111 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1112 velecsum
= _mm_add_pd(velecsum
,velec
);
1116 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1118 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1120 /* Calculate temporary vectorial force */
1121 tx
= _mm_mul_pd(fscal
,dx13
);
1122 ty
= _mm_mul_pd(fscal
,dy13
);
1123 tz
= _mm_mul_pd(fscal
,dz13
);
1125 /* Update vectorial force */
1126 fix1
= _mm_add_pd(fix1
,tx
);
1127 fiy1
= _mm_add_pd(fiy1
,ty
);
1128 fiz1
= _mm_add_pd(fiz1
,tz
);
1130 fjx3
= _mm_add_pd(fjx3
,tx
);
1131 fjy3
= _mm_add_pd(fjy3
,ty
);
1132 fjz3
= _mm_add_pd(fjz3
,tz
);
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1143 r21
= _mm_mul_pd(rsq21
,rinv21
);
1145 /* EWALD ELECTROSTATICS */
1147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1149 ewitab
= _mm_cvttpd_epi32(ewrt
);
1150 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1151 ewitab
= _mm_slli_epi32(ewitab
,2);
1152 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1153 ewtabD
= _mm_setzero_pd();
1154 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1155 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1156 ewtabFn
= _mm_setzero_pd();
1157 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1158 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1159 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1160 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
1161 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1163 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1165 /* Update potential sum for this i atom from the interaction with this j atom. */
1166 velec
= _mm_and_pd(velec
,cutoff_mask
);
1167 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1168 velecsum
= _mm_add_pd(velecsum
,velec
);
1172 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1174 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1176 /* Calculate temporary vectorial force */
1177 tx
= _mm_mul_pd(fscal
,dx21
);
1178 ty
= _mm_mul_pd(fscal
,dy21
);
1179 tz
= _mm_mul_pd(fscal
,dz21
);
1181 /* Update vectorial force */
1182 fix2
= _mm_add_pd(fix2
,tx
);
1183 fiy2
= _mm_add_pd(fiy2
,ty
);
1184 fiz2
= _mm_add_pd(fiz2
,tz
);
1186 fjx1
= _mm_add_pd(fjx1
,tx
);
1187 fjy1
= _mm_add_pd(fjy1
,ty
);
1188 fjz1
= _mm_add_pd(fjz1
,tz
);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1199 r22
= _mm_mul_pd(rsq22
,rinv22
);
1201 /* EWALD ELECTROSTATICS */
1203 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1204 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1205 ewitab
= _mm_cvttpd_epi32(ewrt
);
1206 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1207 ewitab
= _mm_slli_epi32(ewitab
,2);
1208 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1209 ewtabD
= _mm_setzero_pd();
1210 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1211 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1212 ewtabFn
= _mm_setzero_pd();
1213 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1214 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1215 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1216 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
1217 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1219 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1221 /* Update potential sum for this i atom from the interaction with this j atom. */
1222 velec
= _mm_and_pd(velec
,cutoff_mask
);
1223 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1224 velecsum
= _mm_add_pd(velecsum
,velec
);
1228 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1230 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1232 /* Calculate temporary vectorial force */
1233 tx
= _mm_mul_pd(fscal
,dx22
);
1234 ty
= _mm_mul_pd(fscal
,dy22
);
1235 tz
= _mm_mul_pd(fscal
,dz22
);
1237 /* Update vectorial force */
1238 fix2
= _mm_add_pd(fix2
,tx
);
1239 fiy2
= _mm_add_pd(fiy2
,ty
);
1240 fiz2
= _mm_add_pd(fiz2
,tz
);
1242 fjx2
= _mm_add_pd(fjx2
,tx
);
1243 fjy2
= _mm_add_pd(fjy2
,ty
);
1244 fjz2
= _mm_add_pd(fjz2
,tz
);
1248 /**************************
1249 * CALCULATE INTERACTIONS *
1250 **************************/
1252 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1255 r23
= _mm_mul_pd(rsq23
,rinv23
);
1257 /* EWALD ELECTROSTATICS */
1259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1260 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1261 ewitab
= _mm_cvttpd_epi32(ewrt
);
1262 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1263 ewitab
= _mm_slli_epi32(ewitab
,2);
1264 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1265 ewtabD
= _mm_setzero_pd();
1266 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1267 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1268 ewtabFn
= _mm_setzero_pd();
1269 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1270 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1271 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1272 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_sub_pd(rinv23
,sh_ewald
),velec
));
1273 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1275 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1277 /* Update potential sum for this i atom from the interaction with this j atom. */
1278 velec
= _mm_and_pd(velec
,cutoff_mask
);
1279 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1280 velecsum
= _mm_add_pd(velecsum
,velec
);
1284 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1286 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1288 /* Calculate temporary vectorial force */
1289 tx
= _mm_mul_pd(fscal
,dx23
);
1290 ty
= _mm_mul_pd(fscal
,dy23
);
1291 tz
= _mm_mul_pd(fscal
,dz23
);
1293 /* Update vectorial force */
1294 fix2
= _mm_add_pd(fix2
,tx
);
1295 fiy2
= _mm_add_pd(fiy2
,ty
);
1296 fiz2
= _mm_add_pd(fiz2
,tz
);
1298 fjx3
= _mm_add_pd(fjx3
,tx
);
1299 fjy3
= _mm_add_pd(fjy3
,ty
);
1300 fjz3
= _mm_add_pd(fjz3
,tz
);
1304 /**************************
1305 * CALCULATE INTERACTIONS *
1306 **************************/
1308 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1311 r31
= _mm_mul_pd(rsq31
,rinv31
);
1313 /* EWALD ELECTROSTATICS */
1315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1316 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1317 ewitab
= _mm_cvttpd_epi32(ewrt
);
1318 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1319 ewitab
= _mm_slli_epi32(ewitab
,2);
1320 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1321 ewtabD
= _mm_setzero_pd();
1322 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1323 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1324 ewtabFn
= _mm_setzero_pd();
1325 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1326 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1327 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1328 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_sub_pd(rinv31
,sh_ewald
),velec
));
1329 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1331 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1333 /* Update potential sum for this i atom from the interaction with this j atom. */
1334 velec
= _mm_and_pd(velec
,cutoff_mask
);
1335 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1336 velecsum
= _mm_add_pd(velecsum
,velec
);
1340 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1342 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1344 /* Calculate temporary vectorial force */
1345 tx
= _mm_mul_pd(fscal
,dx31
);
1346 ty
= _mm_mul_pd(fscal
,dy31
);
1347 tz
= _mm_mul_pd(fscal
,dz31
);
1349 /* Update vectorial force */
1350 fix3
= _mm_add_pd(fix3
,tx
);
1351 fiy3
= _mm_add_pd(fiy3
,ty
);
1352 fiz3
= _mm_add_pd(fiz3
,tz
);
1354 fjx1
= _mm_add_pd(fjx1
,tx
);
1355 fjy1
= _mm_add_pd(fjy1
,ty
);
1356 fjz1
= _mm_add_pd(fjz1
,tz
);
1360 /**************************
1361 * CALCULATE INTERACTIONS *
1362 **************************/
1364 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1367 r32
= _mm_mul_pd(rsq32
,rinv32
);
1369 /* EWALD ELECTROSTATICS */
1371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1372 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1373 ewitab
= _mm_cvttpd_epi32(ewrt
);
1374 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1375 ewitab
= _mm_slli_epi32(ewitab
,2);
1376 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1377 ewtabD
= _mm_setzero_pd();
1378 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1379 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1380 ewtabFn
= _mm_setzero_pd();
1381 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1382 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1383 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1384 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_sub_pd(rinv32
,sh_ewald
),velec
));
1385 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1387 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1389 /* Update potential sum for this i atom from the interaction with this j atom. */
1390 velec
= _mm_and_pd(velec
,cutoff_mask
);
1391 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1392 velecsum
= _mm_add_pd(velecsum
,velec
);
1396 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1398 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1400 /* Calculate temporary vectorial force */
1401 tx
= _mm_mul_pd(fscal
,dx32
);
1402 ty
= _mm_mul_pd(fscal
,dy32
);
1403 tz
= _mm_mul_pd(fscal
,dz32
);
1405 /* Update vectorial force */
1406 fix3
= _mm_add_pd(fix3
,tx
);
1407 fiy3
= _mm_add_pd(fiy3
,ty
);
1408 fiz3
= _mm_add_pd(fiz3
,tz
);
1410 fjx2
= _mm_add_pd(fjx2
,tx
);
1411 fjy2
= _mm_add_pd(fjy2
,ty
);
1412 fjz2
= _mm_add_pd(fjz2
,tz
);
1416 /**************************
1417 * CALCULATE INTERACTIONS *
1418 **************************/
1420 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1423 r33
= _mm_mul_pd(rsq33
,rinv33
);
1425 /* EWALD ELECTROSTATICS */
1427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1428 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1429 ewitab
= _mm_cvttpd_epi32(ewrt
);
1430 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1431 ewitab
= _mm_slli_epi32(ewitab
,2);
1432 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1433 ewtabD
= _mm_setzero_pd();
1434 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1435 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1436 ewtabFn
= _mm_setzero_pd();
1437 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1438 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1439 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1440 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_sub_pd(rinv33
,sh_ewald
),velec
));
1441 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1443 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1445 /* Update potential sum for this i atom from the interaction with this j atom. */
1446 velec
= _mm_and_pd(velec
,cutoff_mask
);
1447 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1448 velecsum
= _mm_add_pd(velecsum
,velec
);
1452 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1454 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1456 /* Calculate temporary vectorial force */
1457 tx
= _mm_mul_pd(fscal
,dx33
);
1458 ty
= _mm_mul_pd(fscal
,dy33
);
1459 tz
= _mm_mul_pd(fscal
,dz33
);
1461 /* Update vectorial force */
1462 fix3
= _mm_add_pd(fix3
,tx
);
1463 fiy3
= _mm_add_pd(fiy3
,ty
);
1464 fiz3
= _mm_add_pd(fiz3
,tz
);
1466 fjx3
= _mm_add_pd(fjx3
,tx
);
1467 fjy3
= _mm_add_pd(fjy3
,ty
);
1468 fjz3
= _mm_add_pd(fjz3
,tz
);
1472 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1474 /* Inner loop uses 458 flops */
1477 /* End of innermost loop */
1479 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1480 f
+i_coord_offset
,fshift
+i_shift_offset
);
1483 /* Update potential energies */
1484 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1485 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1487 /* Increment number of inner iterations */
1488 inneriter
+= j_index_end
- j_index_start
;
1490 /* Outer loop uses 26 flops */
1493 /* Increment number of outer iterations */
1496 /* Update outer/inner flops */
1498 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*458);
1501 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_double
1502 * Electrostatics interaction: Ewald
1503 * VdW interaction: LennardJones
1504 * Geometry: Water4-Water4
1505 * Calculate force/pot: Force
1508 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_double
1509 (t_nblist
* gmx_restrict nlist
,
1510 rvec
* gmx_restrict xx
,
1511 rvec
* gmx_restrict ff
,
1512 t_forcerec
* gmx_restrict fr
,
1513 t_mdatoms
* gmx_restrict mdatoms
,
1514 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1515 t_nrnb
* gmx_restrict nrnb
)
1517 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1518 * just 0 for non-waters.
1519 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1520 * jnr indices corresponding to data put in the four positions in the SIMD register.
1522 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1523 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1525 int j_coord_offsetA
,j_coord_offsetB
;
1526 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1527 real rcutoff_scalar
;
1528 real
*shiftvec
,*fshift
,*x
,*f
;
1529 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1531 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1533 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1535 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1537 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1538 int vdwjidx0A
,vdwjidx0B
;
1539 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1540 int vdwjidx1A
,vdwjidx1B
;
1541 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1542 int vdwjidx2A
,vdwjidx2B
;
1543 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1544 int vdwjidx3A
,vdwjidx3B
;
1545 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1546 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1547 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1548 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1549 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1550 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1551 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1552 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1553 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1554 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1555 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1556 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1559 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1562 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1563 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1565 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1567 __m128d dummy_mask
,cutoff_mask
;
1568 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1569 __m128d one
= _mm_set1_pd(1.0);
1570 __m128d two
= _mm_set1_pd(2.0);
1576 jindex
= nlist
->jindex
;
1578 shiftidx
= nlist
->shift
;
1580 shiftvec
= fr
->shift_vec
[0];
1581 fshift
= fr
->fshift
[0];
1582 facel
= _mm_set1_pd(fr
->epsfac
);
1583 charge
= mdatoms
->chargeA
;
1584 nvdwtype
= fr
->ntype
;
1585 vdwparam
= fr
->nbfp
;
1586 vdwtype
= mdatoms
->typeA
;
1588 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1589 ewtab
= fr
->ic
->tabq_coul_F
;
1590 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1591 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1593 /* Setup water-specific parameters */
1594 inr
= nlist
->iinr
[0];
1595 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1596 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1597 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1598 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1600 jq1
= _mm_set1_pd(charge
[inr
+1]);
1601 jq2
= _mm_set1_pd(charge
[inr
+2]);
1602 jq3
= _mm_set1_pd(charge
[inr
+3]);
1603 vdwjidx0A
= 2*vdwtype
[inr
+0];
1604 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1605 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1606 qq11
= _mm_mul_pd(iq1
,jq1
);
1607 qq12
= _mm_mul_pd(iq1
,jq2
);
1608 qq13
= _mm_mul_pd(iq1
,jq3
);
1609 qq21
= _mm_mul_pd(iq2
,jq1
);
1610 qq22
= _mm_mul_pd(iq2
,jq2
);
1611 qq23
= _mm_mul_pd(iq2
,jq3
);
1612 qq31
= _mm_mul_pd(iq3
,jq1
);
1613 qq32
= _mm_mul_pd(iq3
,jq2
);
1614 qq33
= _mm_mul_pd(iq3
,jq3
);
1616 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1617 rcutoff_scalar
= fr
->rcoulomb
;
1618 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1619 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1621 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
1622 rvdw
= _mm_set1_pd(fr
->rvdw
);
1624 /* Avoid stupid compiler warnings */
1626 j_coord_offsetA
= 0;
1627 j_coord_offsetB
= 0;
1632 /* Start outer loop over neighborlists */
1633 for(iidx
=0; iidx
<nri
; iidx
++)
1635 /* Load shift vector for this list */
1636 i_shift_offset
= DIM
*shiftidx
[iidx
];
1638 /* Load limits for loop over neighbors */
1639 j_index_start
= jindex
[iidx
];
1640 j_index_end
= jindex
[iidx
+1];
1642 /* Get outer coordinate index */
1644 i_coord_offset
= DIM
*inr
;
1646 /* Load i particle coords and add shift vector */
1647 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1648 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1650 fix0
= _mm_setzero_pd();
1651 fiy0
= _mm_setzero_pd();
1652 fiz0
= _mm_setzero_pd();
1653 fix1
= _mm_setzero_pd();
1654 fiy1
= _mm_setzero_pd();
1655 fiz1
= _mm_setzero_pd();
1656 fix2
= _mm_setzero_pd();
1657 fiy2
= _mm_setzero_pd();
1658 fiz2
= _mm_setzero_pd();
1659 fix3
= _mm_setzero_pd();
1660 fiy3
= _mm_setzero_pd();
1661 fiz3
= _mm_setzero_pd();
1663 /* Start inner kernel loop */
1664 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1667 /* Get j neighbor index, and coordinate index */
1669 jnrB
= jjnr
[jidx
+1];
1670 j_coord_offsetA
= DIM
*jnrA
;
1671 j_coord_offsetB
= DIM
*jnrB
;
1673 /* load j atom coordinates */
1674 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1675 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1676 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1678 /* Calculate displacement vector */
1679 dx00
= _mm_sub_pd(ix0
,jx0
);
1680 dy00
= _mm_sub_pd(iy0
,jy0
);
1681 dz00
= _mm_sub_pd(iz0
,jz0
);
1682 dx11
= _mm_sub_pd(ix1
,jx1
);
1683 dy11
= _mm_sub_pd(iy1
,jy1
);
1684 dz11
= _mm_sub_pd(iz1
,jz1
);
1685 dx12
= _mm_sub_pd(ix1
,jx2
);
1686 dy12
= _mm_sub_pd(iy1
,jy2
);
1687 dz12
= _mm_sub_pd(iz1
,jz2
);
1688 dx13
= _mm_sub_pd(ix1
,jx3
);
1689 dy13
= _mm_sub_pd(iy1
,jy3
);
1690 dz13
= _mm_sub_pd(iz1
,jz3
);
1691 dx21
= _mm_sub_pd(ix2
,jx1
);
1692 dy21
= _mm_sub_pd(iy2
,jy1
);
1693 dz21
= _mm_sub_pd(iz2
,jz1
);
1694 dx22
= _mm_sub_pd(ix2
,jx2
);
1695 dy22
= _mm_sub_pd(iy2
,jy2
);
1696 dz22
= _mm_sub_pd(iz2
,jz2
);
1697 dx23
= _mm_sub_pd(ix2
,jx3
);
1698 dy23
= _mm_sub_pd(iy2
,jy3
);
1699 dz23
= _mm_sub_pd(iz2
,jz3
);
1700 dx31
= _mm_sub_pd(ix3
,jx1
);
1701 dy31
= _mm_sub_pd(iy3
,jy1
);
1702 dz31
= _mm_sub_pd(iz3
,jz1
);
1703 dx32
= _mm_sub_pd(ix3
,jx2
);
1704 dy32
= _mm_sub_pd(iy3
,jy2
);
1705 dz32
= _mm_sub_pd(iz3
,jz2
);
1706 dx33
= _mm_sub_pd(ix3
,jx3
);
1707 dy33
= _mm_sub_pd(iy3
,jy3
);
1708 dz33
= _mm_sub_pd(iz3
,jz3
);
1710 /* Calculate squared distance and things based on it */
1711 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1712 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1713 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1714 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1715 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1716 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1717 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1718 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1719 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1720 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1722 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1723 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1724 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1725 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1726 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1727 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1728 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1729 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1730 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1732 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1733 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1734 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1735 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1736 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1737 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1738 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1739 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1740 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1741 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1743 fjx0
= _mm_setzero_pd();
1744 fjy0
= _mm_setzero_pd();
1745 fjz0
= _mm_setzero_pd();
1746 fjx1
= _mm_setzero_pd();
1747 fjy1
= _mm_setzero_pd();
1748 fjz1
= _mm_setzero_pd();
1749 fjx2
= _mm_setzero_pd();
1750 fjy2
= _mm_setzero_pd();
1751 fjz2
= _mm_setzero_pd();
1752 fjx3
= _mm_setzero_pd();
1753 fjy3
= _mm_setzero_pd();
1754 fjz3
= _mm_setzero_pd();
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1763 /* LENNARD-JONES DISPERSION/REPULSION */
1765 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1766 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1768 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1772 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1774 /* Calculate temporary vectorial force */
1775 tx
= _mm_mul_pd(fscal
,dx00
);
1776 ty
= _mm_mul_pd(fscal
,dy00
);
1777 tz
= _mm_mul_pd(fscal
,dz00
);
1779 /* Update vectorial force */
1780 fix0
= _mm_add_pd(fix0
,tx
);
1781 fiy0
= _mm_add_pd(fiy0
,ty
);
1782 fiz0
= _mm_add_pd(fiz0
,tz
);
1784 fjx0
= _mm_add_pd(fjx0
,tx
);
1785 fjy0
= _mm_add_pd(fjy0
,ty
);
1786 fjz0
= _mm_add_pd(fjz0
,tz
);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1797 r11
= _mm_mul_pd(rsq11
,rinv11
);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1803 ewitab
= _mm_cvttpd_epi32(ewrt
);
1804 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1805 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1807 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1808 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1810 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1814 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1816 /* Calculate temporary vectorial force */
1817 tx
= _mm_mul_pd(fscal
,dx11
);
1818 ty
= _mm_mul_pd(fscal
,dy11
);
1819 tz
= _mm_mul_pd(fscal
,dz11
);
1821 /* Update vectorial force */
1822 fix1
= _mm_add_pd(fix1
,tx
);
1823 fiy1
= _mm_add_pd(fiy1
,ty
);
1824 fiz1
= _mm_add_pd(fiz1
,tz
);
1826 fjx1
= _mm_add_pd(fjx1
,tx
);
1827 fjy1
= _mm_add_pd(fjy1
,ty
);
1828 fjz1
= _mm_add_pd(fjz1
,tz
);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1839 r12
= _mm_mul_pd(rsq12
,rinv12
);
1841 /* EWALD ELECTROSTATICS */
1843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1845 ewitab
= _mm_cvttpd_epi32(ewrt
);
1846 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1847 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1849 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1850 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1852 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1856 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1858 /* Calculate temporary vectorial force */
1859 tx
= _mm_mul_pd(fscal
,dx12
);
1860 ty
= _mm_mul_pd(fscal
,dy12
);
1861 tz
= _mm_mul_pd(fscal
,dz12
);
1863 /* Update vectorial force */
1864 fix1
= _mm_add_pd(fix1
,tx
);
1865 fiy1
= _mm_add_pd(fiy1
,ty
);
1866 fiz1
= _mm_add_pd(fiz1
,tz
);
1868 fjx2
= _mm_add_pd(fjx2
,tx
);
1869 fjy2
= _mm_add_pd(fjy2
,ty
);
1870 fjz2
= _mm_add_pd(fjz2
,tz
);
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1881 r13
= _mm_mul_pd(rsq13
,rinv13
);
1883 /* EWALD ELECTROSTATICS */
1885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1886 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1887 ewitab
= _mm_cvttpd_epi32(ewrt
);
1888 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1889 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1891 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1892 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1894 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1898 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1900 /* Calculate temporary vectorial force */
1901 tx
= _mm_mul_pd(fscal
,dx13
);
1902 ty
= _mm_mul_pd(fscal
,dy13
);
1903 tz
= _mm_mul_pd(fscal
,dz13
);
1905 /* Update vectorial force */
1906 fix1
= _mm_add_pd(fix1
,tx
);
1907 fiy1
= _mm_add_pd(fiy1
,ty
);
1908 fiz1
= _mm_add_pd(fiz1
,tz
);
1910 fjx3
= _mm_add_pd(fjx3
,tx
);
1911 fjy3
= _mm_add_pd(fjy3
,ty
);
1912 fjz3
= _mm_add_pd(fjz3
,tz
);
1916 /**************************
1917 * CALCULATE INTERACTIONS *
1918 **************************/
1920 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1923 r21
= _mm_mul_pd(rsq21
,rinv21
);
1925 /* EWALD ELECTROSTATICS */
1927 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1928 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1929 ewitab
= _mm_cvttpd_epi32(ewrt
);
1930 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1931 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1933 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1934 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1936 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1940 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1942 /* Calculate temporary vectorial force */
1943 tx
= _mm_mul_pd(fscal
,dx21
);
1944 ty
= _mm_mul_pd(fscal
,dy21
);
1945 tz
= _mm_mul_pd(fscal
,dz21
);
1947 /* Update vectorial force */
1948 fix2
= _mm_add_pd(fix2
,tx
);
1949 fiy2
= _mm_add_pd(fiy2
,ty
);
1950 fiz2
= _mm_add_pd(fiz2
,tz
);
1952 fjx1
= _mm_add_pd(fjx1
,tx
);
1953 fjy1
= _mm_add_pd(fjy1
,ty
);
1954 fjz1
= _mm_add_pd(fjz1
,tz
);
1958 /**************************
1959 * CALCULATE INTERACTIONS *
1960 **************************/
1962 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1965 r22
= _mm_mul_pd(rsq22
,rinv22
);
1967 /* EWALD ELECTROSTATICS */
1969 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1970 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1971 ewitab
= _mm_cvttpd_epi32(ewrt
);
1972 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1973 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1975 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1976 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1978 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1982 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1984 /* Calculate temporary vectorial force */
1985 tx
= _mm_mul_pd(fscal
,dx22
);
1986 ty
= _mm_mul_pd(fscal
,dy22
);
1987 tz
= _mm_mul_pd(fscal
,dz22
);
1989 /* Update vectorial force */
1990 fix2
= _mm_add_pd(fix2
,tx
);
1991 fiy2
= _mm_add_pd(fiy2
,ty
);
1992 fiz2
= _mm_add_pd(fiz2
,tz
);
1994 fjx2
= _mm_add_pd(fjx2
,tx
);
1995 fjy2
= _mm_add_pd(fjy2
,ty
);
1996 fjz2
= _mm_add_pd(fjz2
,tz
);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2007 r23
= _mm_mul_pd(rsq23
,rinv23
);
2009 /* EWALD ELECTROSTATICS */
2011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2012 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2013 ewitab
= _mm_cvttpd_epi32(ewrt
);
2014 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2015 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2017 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2018 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2020 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2024 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2026 /* Calculate temporary vectorial force */
2027 tx
= _mm_mul_pd(fscal
,dx23
);
2028 ty
= _mm_mul_pd(fscal
,dy23
);
2029 tz
= _mm_mul_pd(fscal
,dz23
);
2031 /* Update vectorial force */
2032 fix2
= _mm_add_pd(fix2
,tx
);
2033 fiy2
= _mm_add_pd(fiy2
,ty
);
2034 fiz2
= _mm_add_pd(fiz2
,tz
);
2036 fjx3
= _mm_add_pd(fjx3
,tx
);
2037 fjy3
= _mm_add_pd(fjy3
,ty
);
2038 fjz3
= _mm_add_pd(fjz3
,tz
);
2042 /**************************
2043 * CALCULATE INTERACTIONS *
2044 **************************/
2046 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2049 r31
= _mm_mul_pd(rsq31
,rinv31
);
2051 /* EWALD ELECTROSTATICS */
2053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2054 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2055 ewitab
= _mm_cvttpd_epi32(ewrt
);
2056 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2057 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2059 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2060 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2062 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2066 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2068 /* Calculate temporary vectorial force */
2069 tx
= _mm_mul_pd(fscal
,dx31
);
2070 ty
= _mm_mul_pd(fscal
,dy31
);
2071 tz
= _mm_mul_pd(fscal
,dz31
);
2073 /* Update vectorial force */
2074 fix3
= _mm_add_pd(fix3
,tx
);
2075 fiy3
= _mm_add_pd(fiy3
,ty
);
2076 fiz3
= _mm_add_pd(fiz3
,tz
);
2078 fjx1
= _mm_add_pd(fjx1
,tx
);
2079 fjy1
= _mm_add_pd(fjy1
,ty
);
2080 fjz1
= _mm_add_pd(fjz1
,tz
);
2084 /**************************
2085 * CALCULATE INTERACTIONS *
2086 **************************/
2088 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2091 r32
= _mm_mul_pd(rsq32
,rinv32
);
2093 /* EWALD ELECTROSTATICS */
2095 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2096 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2097 ewitab
= _mm_cvttpd_epi32(ewrt
);
2098 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2099 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2101 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2102 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2104 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2108 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2110 /* Calculate temporary vectorial force */
2111 tx
= _mm_mul_pd(fscal
,dx32
);
2112 ty
= _mm_mul_pd(fscal
,dy32
);
2113 tz
= _mm_mul_pd(fscal
,dz32
);
2115 /* Update vectorial force */
2116 fix3
= _mm_add_pd(fix3
,tx
);
2117 fiy3
= _mm_add_pd(fiy3
,ty
);
2118 fiz3
= _mm_add_pd(fiz3
,tz
);
2120 fjx2
= _mm_add_pd(fjx2
,tx
);
2121 fjy2
= _mm_add_pd(fjy2
,ty
);
2122 fjz2
= _mm_add_pd(fjz2
,tz
);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2133 r33
= _mm_mul_pd(rsq33
,rinv33
);
2135 /* EWALD ELECTROSTATICS */
2137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2138 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2139 ewitab
= _mm_cvttpd_epi32(ewrt
);
2140 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2141 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2143 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2144 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2146 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2150 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2152 /* Calculate temporary vectorial force */
2153 tx
= _mm_mul_pd(fscal
,dx33
);
2154 ty
= _mm_mul_pd(fscal
,dy33
);
2155 tz
= _mm_mul_pd(fscal
,dz33
);
2157 /* Update vectorial force */
2158 fix3
= _mm_add_pd(fix3
,tx
);
2159 fiy3
= _mm_add_pd(fiy3
,ty
);
2160 fiz3
= _mm_add_pd(fiz3
,tz
);
2162 fjx3
= _mm_add_pd(fjx3
,tx
);
2163 fjy3
= _mm_add_pd(fjy3
,ty
);
2164 fjz3
= _mm_add_pd(fjz3
,tz
);
2168 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2170 /* Inner loop uses 384 flops */
2173 if(jidx
<j_index_end
)
2177 j_coord_offsetA
= DIM
*jnrA
;
2179 /* load j atom coordinates */
2180 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2181 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2182 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2184 /* Calculate displacement vector */
2185 dx00
= _mm_sub_pd(ix0
,jx0
);
2186 dy00
= _mm_sub_pd(iy0
,jy0
);
2187 dz00
= _mm_sub_pd(iz0
,jz0
);
2188 dx11
= _mm_sub_pd(ix1
,jx1
);
2189 dy11
= _mm_sub_pd(iy1
,jy1
);
2190 dz11
= _mm_sub_pd(iz1
,jz1
);
2191 dx12
= _mm_sub_pd(ix1
,jx2
);
2192 dy12
= _mm_sub_pd(iy1
,jy2
);
2193 dz12
= _mm_sub_pd(iz1
,jz2
);
2194 dx13
= _mm_sub_pd(ix1
,jx3
);
2195 dy13
= _mm_sub_pd(iy1
,jy3
);
2196 dz13
= _mm_sub_pd(iz1
,jz3
);
2197 dx21
= _mm_sub_pd(ix2
,jx1
);
2198 dy21
= _mm_sub_pd(iy2
,jy1
);
2199 dz21
= _mm_sub_pd(iz2
,jz1
);
2200 dx22
= _mm_sub_pd(ix2
,jx2
);
2201 dy22
= _mm_sub_pd(iy2
,jy2
);
2202 dz22
= _mm_sub_pd(iz2
,jz2
);
2203 dx23
= _mm_sub_pd(ix2
,jx3
);
2204 dy23
= _mm_sub_pd(iy2
,jy3
);
2205 dz23
= _mm_sub_pd(iz2
,jz3
);
2206 dx31
= _mm_sub_pd(ix3
,jx1
);
2207 dy31
= _mm_sub_pd(iy3
,jy1
);
2208 dz31
= _mm_sub_pd(iz3
,jz1
);
2209 dx32
= _mm_sub_pd(ix3
,jx2
);
2210 dy32
= _mm_sub_pd(iy3
,jy2
);
2211 dz32
= _mm_sub_pd(iz3
,jz2
);
2212 dx33
= _mm_sub_pd(ix3
,jx3
);
2213 dy33
= _mm_sub_pd(iy3
,jy3
);
2214 dz33
= _mm_sub_pd(iz3
,jz3
);
2216 /* Calculate squared distance and things based on it */
2217 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2218 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2219 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2220 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
2221 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2222 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2223 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
2224 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
2225 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
2226 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
2228 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
2229 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
2230 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
2231 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
2232 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
2233 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
2234 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
2235 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
2236 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
2238 rinvsq00
= gmx_mm_inv_pd(rsq00
);
2239 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2240 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2241 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
2242 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2243 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2244 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
2245 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
2246 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
2247 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
2249 fjx0
= _mm_setzero_pd();
2250 fjy0
= _mm_setzero_pd();
2251 fjz0
= _mm_setzero_pd();
2252 fjx1
= _mm_setzero_pd();
2253 fjy1
= _mm_setzero_pd();
2254 fjz1
= _mm_setzero_pd();
2255 fjx2
= _mm_setzero_pd();
2256 fjy2
= _mm_setzero_pd();
2257 fjz2
= _mm_setzero_pd();
2258 fjx3
= _mm_setzero_pd();
2259 fjy3
= _mm_setzero_pd();
2260 fjz3
= _mm_setzero_pd();
2262 /**************************
2263 * CALCULATE INTERACTIONS *
2264 **************************/
2266 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2269 /* LENNARD-JONES DISPERSION/REPULSION */
2271 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2272 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
2274 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2278 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2280 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2282 /* Calculate temporary vectorial force */
2283 tx
= _mm_mul_pd(fscal
,dx00
);
2284 ty
= _mm_mul_pd(fscal
,dy00
);
2285 tz
= _mm_mul_pd(fscal
,dz00
);
2287 /* Update vectorial force */
2288 fix0
= _mm_add_pd(fix0
,tx
);
2289 fiy0
= _mm_add_pd(fiy0
,ty
);
2290 fiz0
= _mm_add_pd(fiz0
,tz
);
2292 fjx0
= _mm_add_pd(fjx0
,tx
);
2293 fjy0
= _mm_add_pd(fjy0
,ty
);
2294 fjz0
= _mm_add_pd(fjz0
,tz
);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2305 r11
= _mm_mul_pd(rsq11
,rinv11
);
2307 /* EWALD ELECTROSTATICS */
2309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2310 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2311 ewitab
= _mm_cvttpd_epi32(ewrt
);
2312 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2313 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2314 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2315 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2317 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2321 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2323 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2325 /* Calculate temporary vectorial force */
2326 tx
= _mm_mul_pd(fscal
,dx11
);
2327 ty
= _mm_mul_pd(fscal
,dy11
);
2328 tz
= _mm_mul_pd(fscal
,dz11
);
2330 /* Update vectorial force */
2331 fix1
= _mm_add_pd(fix1
,tx
);
2332 fiy1
= _mm_add_pd(fiy1
,ty
);
2333 fiz1
= _mm_add_pd(fiz1
,tz
);
2335 fjx1
= _mm_add_pd(fjx1
,tx
);
2336 fjy1
= _mm_add_pd(fjy1
,ty
);
2337 fjz1
= _mm_add_pd(fjz1
,tz
);
2341 /**************************
2342 * CALCULATE INTERACTIONS *
2343 **************************/
2345 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2348 r12
= _mm_mul_pd(rsq12
,rinv12
);
2350 /* EWALD ELECTROSTATICS */
2352 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2353 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2354 ewitab
= _mm_cvttpd_epi32(ewrt
);
2355 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2356 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2357 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2358 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2360 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2364 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2366 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2368 /* Calculate temporary vectorial force */
2369 tx
= _mm_mul_pd(fscal
,dx12
);
2370 ty
= _mm_mul_pd(fscal
,dy12
);
2371 tz
= _mm_mul_pd(fscal
,dz12
);
2373 /* Update vectorial force */
2374 fix1
= _mm_add_pd(fix1
,tx
);
2375 fiy1
= _mm_add_pd(fiy1
,ty
);
2376 fiz1
= _mm_add_pd(fiz1
,tz
);
2378 fjx2
= _mm_add_pd(fjx2
,tx
);
2379 fjy2
= _mm_add_pd(fjy2
,ty
);
2380 fjz2
= _mm_add_pd(fjz2
,tz
);
2384 /**************************
2385 * CALCULATE INTERACTIONS *
2386 **************************/
2388 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2391 r13
= _mm_mul_pd(rsq13
,rinv13
);
2393 /* EWALD ELECTROSTATICS */
2395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2396 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2397 ewitab
= _mm_cvttpd_epi32(ewrt
);
2398 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2399 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2400 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2401 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2403 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2407 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2409 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2411 /* Calculate temporary vectorial force */
2412 tx
= _mm_mul_pd(fscal
,dx13
);
2413 ty
= _mm_mul_pd(fscal
,dy13
);
2414 tz
= _mm_mul_pd(fscal
,dz13
);
2416 /* Update vectorial force */
2417 fix1
= _mm_add_pd(fix1
,tx
);
2418 fiy1
= _mm_add_pd(fiy1
,ty
);
2419 fiz1
= _mm_add_pd(fiz1
,tz
);
2421 fjx3
= _mm_add_pd(fjx3
,tx
);
2422 fjy3
= _mm_add_pd(fjy3
,ty
);
2423 fjz3
= _mm_add_pd(fjz3
,tz
);
2427 /**************************
2428 * CALCULATE INTERACTIONS *
2429 **************************/
2431 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2434 r21
= _mm_mul_pd(rsq21
,rinv21
);
2436 /* EWALD ELECTROSTATICS */
2438 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2439 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2440 ewitab
= _mm_cvttpd_epi32(ewrt
);
2441 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2442 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2443 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2444 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2446 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2450 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2452 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2454 /* Calculate temporary vectorial force */
2455 tx
= _mm_mul_pd(fscal
,dx21
);
2456 ty
= _mm_mul_pd(fscal
,dy21
);
2457 tz
= _mm_mul_pd(fscal
,dz21
);
2459 /* Update vectorial force */
2460 fix2
= _mm_add_pd(fix2
,tx
);
2461 fiy2
= _mm_add_pd(fiy2
,ty
);
2462 fiz2
= _mm_add_pd(fiz2
,tz
);
2464 fjx1
= _mm_add_pd(fjx1
,tx
);
2465 fjy1
= _mm_add_pd(fjy1
,ty
);
2466 fjz1
= _mm_add_pd(fjz1
,tz
);
2470 /**************************
2471 * CALCULATE INTERACTIONS *
2472 **************************/
2474 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2477 r22
= _mm_mul_pd(rsq22
,rinv22
);
2479 /* EWALD ELECTROSTATICS */
2481 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2482 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2483 ewitab
= _mm_cvttpd_epi32(ewrt
);
2484 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2485 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2486 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2487 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2489 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2493 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2495 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2497 /* Calculate temporary vectorial force */
2498 tx
= _mm_mul_pd(fscal
,dx22
);
2499 ty
= _mm_mul_pd(fscal
,dy22
);
2500 tz
= _mm_mul_pd(fscal
,dz22
);
2502 /* Update vectorial force */
2503 fix2
= _mm_add_pd(fix2
,tx
);
2504 fiy2
= _mm_add_pd(fiy2
,ty
);
2505 fiz2
= _mm_add_pd(fiz2
,tz
);
2507 fjx2
= _mm_add_pd(fjx2
,tx
);
2508 fjy2
= _mm_add_pd(fjy2
,ty
);
2509 fjz2
= _mm_add_pd(fjz2
,tz
);
2513 /**************************
2514 * CALCULATE INTERACTIONS *
2515 **************************/
2517 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2520 r23
= _mm_mul_pd(rsq23
,rinv23
);
2522 /* EWALD ELECTROSTATICS */
2524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2526 ewitab
= _mm_cvttpd_epi32(ewrt
);
2527 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2528 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2529 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2530 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2532 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2536 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2538 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2540 /* Calculate temporary vectorial force */
2541 tx
= _mm_mul_pd(fscal
,dx23
);
2542 ty
= _mm_mul_pd(fscal
,dy23
);
2543 tz
= _mm_mul_pd(fscal
,dz23
);
2545 /* Update vectorial force */
2546 fix2
= _mm_add_pd(fix2
,tx
);
2547 fiy2
= _mm_add_pd(fiy2
,ty
);
2548 fiz2
= _mm_add_pd(fiz2
,tz
);
2550 fjx3
= _mm_add_pd(fjx3
,tx
);
2551 fjy3
= _mm_add_pd(fjy3
,ty
);
2552 fjz3
= _mm_add_pd(fjz3
,tz
);
2556 /**************************
2557 * CALCULATE INTERACTIONS *
2558 **************************/
2560 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2563 r31
= _mm_mul_pd(rsq31
,rinv31
);
2565 /* EWALD ELECTROSTATICS */
2567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2568 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2569 ewitab
= _mm_cvttpd_epi32(ewrt
);
2570 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2571 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2572 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2573 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2575 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2579 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2581 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2583 /* Calculate temporary vectorial force */
2584 tx
= _mm_mul_pd(fscal
,dx31
);
2585 ty
= _mm_mul_pd(fscal
,dy31
);
2586 tz
= _mm_mul_pd(fscal
,dz31
);
2588 /* Update vectorial force */
2589 fix3
= _mm_add_pd(fix3
,tx
);
2590 fiy3
= _mm_add_pd(fiy3
,ty
);
2591 fiz3
= _mm_add_pd(fiz3
,tz
);
2593 fjx1
= _mm_add_pd(fjx1
,tx
);
2594 fjy1
= _mm_add_pd(fjy1
,ty
);
2595 fjz1
= _mm_add_pd(fjz1
,tz
);
2599 /**************************
2600 * CALCULATE INTERACTIONS *
2601 **************************/
2603 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2606 r32
= _mm_mul_pd(rsq32
,rinv32
);
2608 /* EWALD ELECTROSTATICS */
2610 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2611 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2612 ewitab
= _mm_cvttpd_epi32(ewrt
);
2613 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2614 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2615 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2616 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2618 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2622 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2624 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2626 /* Calculate temporary vectorial force */
2627 tx
= _mm_mul_pd(fscal
,dx32
);
2628 ty
= _mm_mul_pd(fscal
,dy32
);
2629 tz
= _mm_mul_pd(fscal
,dz32
);
2631 /* Update vectorial force */
2632 fix3
= _mm_add_pd(fix3
,tx
);
2633 fiy3
= _mm_add_pd(fiy3
,ty
);
2634 fiz3
= _mm_add_pd(fiz3
,tz
);
2636 fjx2
= _mm_add_pd(fjx2
,tx
);
2637 fjy2
= _mm_add_pd(fjy2
,ty
);
2638 fjz2
= _mm_add_pd(fjz2
,tz
);
2642 /**************************
2643 * CALCULATE INTERACTIONS *
2644 **************************/
2646 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2649 r33
= _mm_mul_pd(rsq33
,rinv33
);
2651 /* EWALD ELECTROSTATICS */
2653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2654 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2655 ewitab
= _mm_cvttpd_epi32(ewrt
);
2656 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2657 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2658 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2659 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2661 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2665 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2667 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2669 /* Calculate temporary vectorial force */
2670 tx
= _mm_mul_pd(fscal
,dx33
);
2671 ty
= _mm_mul_pd(fscal
,dy33
);
2672 tz
= _mm_mul_pd(fscal
,dz33
);
2674 /* Update vectorial force */
2675 fix3
= _mm_add_pd(fix3
,tx
);
2676 fiy3
= _mm_add_pd(fiy3
,ty
);
2677 fiz3
= _mm_add_pd(fiz3
,tz
);
2679 fjx3
= _mm_add_pd(fjx3
,tx
);
2680 fjy3
= _mm_add_pd(fjy3
,ty
);
2681 fjz3
= _mm_add_pd(fjz3
,tz
);
2685 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2687 /* Inner loop uses 384 flops */
2690 /* End of innermost loop */
2692 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2693 f
+i_coord_offset
,fshift
+i_shift_offset
);
2695 /* Increment number of inner iterations */
2696 inneriter
+= j_index_end
- j_index_start
;
2698 /* Outer loop uses 24 flops */
2701 /* Increment number of outer iterations */
2704 /* Update outer/inner flops */
2706 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*384);