2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse4_1_double.h"
49 #include "kernelutil_x86_sse4_1_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
125 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
127 __m128d one_half
= _mm_set1_pd(0.5);
128 __m128d minus_one
= _mm_set1_pd(-1.0);
130 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
132 __m128d dummy_mask
,cutoff_mask
;
133 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
134 __m128d one
= _mm_set1_pd(1.0);
135 __m128d two
= _mm_set1_pd(2.0);
141 jindex
= nlist
->jindex
;
143 shiftidx
= nlist
->shift
;
145 shiftvec
= fr
->shift_vec
[0];
146 fshift
= fr
->fshift
[0];
147 facel
= _mm_set1_pd(fr
->epsfac
);
148 charge
= mdatoms
->chargeA
;
149 nvdwtype
= fr
->ntype
;
151 vdwtype
= mdatoms
->typeA
;
152 vdwgridparam
= fr
->ljpme_c6grid
;
153 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
154 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
155 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
157 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
158 ewtab
= fr
->ic
->tabq_coul_FDV0
;
159 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
160 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
162 /* Setup water-specific parameters */
163 inr
= nlist
->iinr
[0];
164 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
165 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
166 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
167 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
169 jq1
= _mm_set1_pd(charge
[inr
+1]);
170 jq2
= _mm_set1_pd(charge
[inr
+2]);
171 jq3
= _mm_set1_pd(charge
[inr
+3]);
172 vdwjidx0A
= 2*vdwtype
[inr
+0];
173 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
174 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
175 c6grid_00
= _mm_set1_pd(vdwgridparam
[vdwioffset0
+vdwjidx0A
]);
176 qq11
= _mm_mul_pd(iq1
,jq1
);
177 qq12
= _mm_mul_pd(iq1
,jq2
);
178 qq13
= _mm_mul_pd(iq1
,jq3
);
179 qq21
= _mm_mul_pd(iq2
,jq1
);
180 qq22
= _mm_mul_pd(iq2
,jq2
);
181 qq23
= _mm_mul_pd(iq2
,jq3
);
182 qq31
= _mm_mul_pd(iq3
,jq1
);
183 qq32
= _mm_mul_pd(iq3
,jq2
);
184 qq33
= _mm_mul_pd(iq3
,jq3
);
186 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
187 rcutoff_scalar
= fr
->rcoulomb
;
188 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
189 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
191 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
192 rvdw
= _mm_set1_pd(fr
->rvdw
);
194 /* Avoid stupid compiler warnings */
202 /* Start outer loop over neighborlists */
203 for(iidx
=0; iidx
<nri
; iidx
++)
205 /* Load shift vector for this list */
206 i_shift_offset
= DIM
*shiftidx
[iidx
];
208 /* Load limits for loop over neighbors */
209 j_index_start
= jindex
[iidx
];
210 j_index_end
= jindex
[iidx
+1];
212 /* Get outer coordinate index */
214 i_coord_offset
= DIM
*inr
;
216 /* Load i particle coords and add shift vector */
217 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
218 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
220 fix0
= _mm_setzero_pd();
221 fiy0
= _mm_setzero_pd();
222 fiz0
= _mm_setzero_pd();
223 fix1
= _mm_setzero_pd();
224 fiy1
= _mm_setzero_pd();
225 fiz1
= _mm_setzero_pd();
226 fix2
= _mm_setzero_pd();
227 fiy2
= _mm_setzero_pd();
228 fiz2
= _mm_setzero_pd();
229 fix3
= _mm_setzero_pd();
230 fiy3
= _mm_setzero_pd();
231 fiz3
= _mm_setzero_pd();
233 /* Reset potential sums */
234 velecsum
= _mm_setzero_pd();
235 vvdwsum
= _mm_setzero_pd();
237 /* Start inner kernel loop */
238 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
241 /* Get j neighbor index, and coordinate index */
244 j_coord_offsetA
= DIM
*jnrA
;
245 j_coord_offsetB
= DIM
*jnrB
;
247 /* load j atom coordinates */
248 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
249 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
250 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
252 /* Calculate displacement vector */
253 dx00
= _mm_sub_pd(ix0
,jx0
);
254 dy00
= _mm_sub_pd(iy0
,jy0
);
255 dz00
= _mm_sub_pd(iz0
,jz0
);
256 dx11
= _mm_sub_pd(ix1
,jx1
);
257 dy11
= _mm_sub_pd(iy1
,jy1
);
258 dz11
= _mm_sub_pd(iz1
,jz1
);
259 dx12
= _mm_sub_pd(ix1
,jx2
);
260 dy12
= _mm_sub_pd(iy1
,jy2
);
261 dz12
= _mm_sub_pd(iz1
,jz2
);
262 dx13
= _mm_sub_pd(ix1
,jx3
);
263 dy13
= _mm_sub_pd(iy1
,jy3
);
264 dz13
= _mm_sub_pd(iz1
,jz3
);
265 dx21
= _mm_sub_pd(ix2
,jx1
);
266 dy21
= _mm_sub_pd(iy2
,jy1
);
267 dz21
= _mm_sub_pd(iz2
,jz1
);
268 dx22
= _mm_sub_pd(ix2
,jx2
);
269 dy22
= _mm_sub_pd(iy2
,jy2
);
270 dz22
= _mm_sub_pd(iz2
,jz2
);
271 dx23
= _mm_sub_pd(ix2
,jx3
);
272 dy23
= _mm_sub_pd(iy2
,jy3
);
273 dz23
= _mm_sub_pd(iz2
,jz3
);
274 dx31
= _mm_sub_pd(ix3
,jx1
);
275 dy31
= _mm_sub_pd(iy3
,jy1
);
276 dz31
= _mm_sub_pd(iz3
,jz1
);
277 dx32
= _mm_sub_pd(ix3
,jx2
);
278 dy32
= _mm_sub_pd(iy3
,jy2
);
279 dz32
= _mm_sub_pd(iz3
,jz2
);
280 dx33
= _mm_sub_pd(ix3
,jx3
);
281 dy33
= _mm_sub_pd(iy3
,jy3
);
282 dz33
= _mm_sub_pd(iz3
,jz3
);
284 /* Calculate squared distance and things based on it */
285 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
286 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
287 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
288 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
289 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
290 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
291 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
292 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
293 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
294 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
296 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
297 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
298 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
299 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
300 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
301 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
302 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
303 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
304 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
305 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
307 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
308 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
309 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
310 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
311 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
312 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
313 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
314 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
315 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
316 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
318 fjx0
= _mm_setzero_pd();
319 fjy0
= _mm_setzero_pd();
320 fjz0
= _mm_setzero_pd();
321 fjx1
= _mm_setzero_pd();
322 fjy1
= _mm_setzero_pd();
323 fjz1
= _mm_setzero_pd();
324 fjx2
= _mm_setzero_pd();
325 fjy2
= _mm_setzero_pd();
326 fjz2
= _mm_setzero_pd();
327 fjx3
= _mm_setzero_pd();
328 fjy3
= _mm_setzero_pd();
329 fjz3
= _mm_setzero_pd();
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
338 r00
= _mm_mul_pd(rsq00
,rinv00
);
340 /* Analytical LJ-PME */
341 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
342 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
343 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
344 exponent
= gmx_simd_exp_d(ewcljrsq
);
345 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
346 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
347 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
348 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
349 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
350 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
351 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
352 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
353 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
355 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
359 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
363 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
365 /* Calculate temporary vectorial force */
366 tx
= _mm_mul_pd(fscal
,dx00
);
367 ty
= _mm_mul_pd(fscal
,dy00
);
368 tz
= _mm_mul_pd(fscal
,dz00
);
370 /* Update vectorial force */
371 fix0
= _mm_add_pd(fix0
,tx
);
372 fiy0
= _mm_add_pd(fiy0
,ty
);
373 fiz0
= _mm_add_pd(fiz0
,tz
);
375 fjx0
= _mm_add_pd(fjx0
,tx
);
376 fjy0
= _mm_add_pd(fjy0
,ty
);
377 fjz0
= _mm_add_pd(fjz0
,tz
);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
388 r11
= _mm_mul_pd(rsq11
,rinv11
);
390 /* EWALD ELECTROSTATICS */
392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
393 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
394 ewitab
= _mm_cvttpd_epi32(ewrt
);
395 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
396 ewitab
= _mm_slli_epi32(ewitab
,2);
397 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
398 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
399 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
400 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
401 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
402 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
403 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
404 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
405 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
406 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
408 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
410 /* Update potential sum for this i atom from the interaction with this j atom. */
411 velec
= _mm_and_pd(velec
,cutoff_mask
);
412 velecsum
= _mm_add_pd(velecsum
,velec
);
416 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
418 /* Calculate temporary vectorial force */
419 tx
= _mm_mul_pd(fscal
,dx11
);
420 ty
= _mm_mul_pd(fscal
,dy11
);
421 tz
= _mm_mul_pd(fscal
,dz11
);
423 /* Update vectorial force */
424 fix1
= _mm_add_pd(fix1
,tx
);
425 fiy1
= _mm_add_pd(fiy1
,ty
);
426 fiz1
= _mm_add_pd(fiz1
,tz
);
428 fjx1
= _mm_add_pd(fjx1
,tx
);
429 fjy1
= _mm_add_pd(fjy1
,ty
);
430 fjz1
= _mm_add_pd(fjz1
,tz
);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
441 r12
= _mm_mul_pd(rsq12
,rinv12
);
443 /* EWALD ELECTROSTATICS */
445 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
446 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
447 ewitab
= _mm_cvttpd_epi32(ewrt
);
448 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
449 ewitab
= _mm_slli_epi32(ewitab
,2);
450 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
451 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
452 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
453 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
454 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
455 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
456 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
457 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
458 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
459 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
461 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec
= _mm_and_pd(velec
,cutoff_mask
);
465 velecsum
= _mm_add_pd(velecsum
,velec
);
469 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
471 /* Calculate temporary vectorial force */
472 tx
= _mm_mul_pd(fscal
,dx12
);
473 ty
= _mm_mul_pd(fscal
,dy12
);
474 tz
= _mm_mul_pd(fscal
,dz12
);
476 /* Update vectorial force */
477 fix1
= _mm_add_pd(fix1
,tx
);
478 fiy1
= _mm_add_pd(fiy1
,ty
);
479 fiz1
= _mm_add_pd(fiz1
,tz
);
481 fjx2
= _mm_add_pd(fjx2
,tx
);
482 fjy2
= _mm_add_pd(fjy2
,ty
);
483 fjz2
= _mm_add_pd(fjz2
,tz
);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
494 r13
= _mm_mul_pd(rsq13
,rinv13
);
496 /* EWALD ELECTROSTATICS */
498 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
499 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
500 ewitab
= _mm_cvttpd_epi32(ewrt
);
501 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
502 ewitab
= _mm_slli_epi32(ewitab
,2);
503 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
504 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
505 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
506 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
507 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
508 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
509 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
510 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
511 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_sub_pd(rinv13
,sh_ewald
),velec
));
512 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
514 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velec
= _mm_and_pd(velec
,cutoff_mask
);
518 velecsum
= _mm_add_pd(velecsum
,velec
);
522 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
524 /* Calculate temporary vectorial force */
525 tx
= _mm_mul_pd(fscal
,dx13
);
526 ty
= _mm_mul_pd(fscal
,dy13
);
527 tz
= _mm_mul_pd(fscal
,dz13
);
529 /* Update vectorial force */
530 fix1
= _mm_add_pd(fix1
,tx
);
531 fiy1
= _mm_add_pd(fiy1
,ty
);
532 fiz1
= _mm_add_pd(fiz1
,tz
);
534 fjx3
= _mm_add_pd(fjx3
,tx
);
535 fjy3
= _mm_add_pd(fjy3
,ty
);
536 fjz3
= _mm_add_pd(fjz3
,tz
);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
547 r21
= _mm_mul_pd(rsq21
,rinv21
);
549 /* EWALD ELECTROSTATICS */
551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
553 ewitab
= _mm_cvttpd_epi32(ewrt
);
554 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
555 ewitab
= _mm_slli_epi32(ewitab
,2);
556 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
557 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
558 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
559 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
560 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
561 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
562 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
563 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
564 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
565 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
567 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec
= _mm_and_pd(velec
,cutoff_mask
);
571 velecsum
= _mm_add_pd(velecsum
,velec
);
575 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
577 /* Calculate temporary vectorial force */
578 tx
= _mm_mul_pd(fscal
,dx21
);
579 ty
= _mm_mul_pd(fscal
,dy21
);
580 tz
= _mm_mul_pd(fscal
,dz21
);
582 /* Update vectorial force */
583 fix2
= _mm_add_pd(fix2
,tx
);
584 fiy2
= _mm_add_pd(fiy2
,ty
);
585 fiz2
= _mm_add_pd(fiz2
,tz
);
587 fjx1
= _mm_add_pd(fjx1
,tx
);
588 fjy1
= _mm_add_pd(fjy1
,ty
);
589 fjz1
= _mm_add_pd(fjz1
,tz
);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
600 r22
= _mm_mul_pd(rsq22
,rinv22
);
602 /* EWALD ELECTROSTATICS */
604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
606 ewitab
= _mm_cvttpd_epi32(ewrt
);
607 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
608 ewitab
= _mm_slli_epi32(ewitab
,2);
609 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
610 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
611 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
612 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
613 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
614 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
615 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
616 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
617 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
618 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
620 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec
= _mm_and_pd(velec
,cutoff_mask
);
624 velecsum
= _mm_add_pd(velecsum
,velec
);
628 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
630 /* Calculate temporary vectorial force */
631 tx
= _mm_mul_pd(fscal
,dx22
);
632 ty
= _mm_mul_pd(fscal
,dy22
);
633 tz
= _mm_mul_pd(fscal
,dz22
);
635 /* Update vectorial force */
636 fix2
= _mm_add_pd(fix2
,tx
);
637 fiy2
= _mm_add_pd(fiy2
,ty
);
638 fiz2
= _mm_add_pd(fiz2
,tz
);
640 fjx2
= _mm_add_pd(fjx2
,tx
);
641 fjy2
= _mm_add_pd(fjy2
,ty
);
642 fjz2
= _mm_add_pd(fjz2
,tz
);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
653 r23
= _mm_mul_pd(rsq23
,rinv23
);
655 /* EWALD ELECTROSTATICS */
657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
658 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
659 ewitab
= _mm_cvttpd_epi32(ewrt
);
660 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
661 ewitab
= _mm_slli_epi32(ewitab
,2);
662 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
663 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
664 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
665 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
666 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
667 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
668 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
669 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
670 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_sub_pd(rinv23
,sh_ewald
),velec
));
671 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
673 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec
= _mm_and_pd(velec
,cutoff_mask
);
677 velecsum
= _mm_add_pd(velecsum
,velec
);
681 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
683 /* Calculate temporary vectorial force */
684 tx
= _mm_mul_pd(fscal
,dx23
);
685 ty
= _mm_mul_pd(fscal
,dy23
);
686 tz
= _mm_mul_pd(fscal
,dz23
);
688 /* Update vectorial force */
689 fix2
= _mm_add_pd(fix2
,tx
);
690 fiy2
= _mm_add_pd(fiy2
,ty
);
691 fiz2
= _mm_add_pd(fiz2
,tz
);
693 fjx3
= _mm_add_pd(fjx3
,tx
);
694 fjy3
= _mm_add_pd(fjy3
,ty
);
695 fjz3
= _mm_add_pd(fjz3
,tz
);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
706 r31
= _mm_mul_pd(rsq31
,rinv31
);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
712 ewitab
= _mm_cvttpd_epi32(ewrt
);
713 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
714 ewitab
= _mm_slli_epi32(ewitab
,2);
715 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
716 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
717 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
718 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
719 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
720 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
721 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
722 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
723 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_sub_pd(rinv31
,sh_ewald
),velec
));
724 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
726 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
728 /* Update potential sum for this i atom from the interaction with this j atom. */
729 velec
= _mm_and_pd(velec
,cutoff_mask
);
730 velecsum
= _mm_add_pd(velecsum
,velec
);
734 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
736 /* Calculate temporary vectorial force */
737 tx
= _mm_mul_pd(fscal
,dx31
);
738 ty
= _mm_mul_pd(fscal
,dy31
);
739 tz
= _mm_mul_pd(fscal
,dz31
);
741 /* Update vectorial force */
742 fix3
= _mm_add_pd(fix3
,tx
);
743 fiy3
= _mm_add_pd(fiy3
,ty
);
744 fiz3
= _mm_add_pd(fiz3
,tz
);
746 fjx1
= _mm_add_pd(fjx1
,tx
);
747 fjy1
= _mm_add_pd(fjy1
,ty
);
748 fjz1
= _mm_add_pd(fjz1
,tz
);
752 /**************************
753 * CALCULATE INTERACTIONS *
754 **************************/
756 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
759 r32
= _mm_mul_pd(rsq32
,rinv32
);
761 /* EWALD ELECTROSTATICS */
763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
764 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
765 ewitab
= _mm_cvttpd_epi32(ewrt
);
766 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
767 ewitab
= _mm_slli_epi32(ewitab
,2);
768 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
769 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
770 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
771 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
772 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
773 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
774 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
775 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
776 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_sub_pd(rinv32
,sh_ewald
),velec
));
777 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
779 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
781 /* Update potential sum for this i atom from the interaction with this j atom. */
782 velec
= _mm_and_pd(velec
,cutoff_mask
);
783 velecsum
= _mm_add_pd(velecsum
,velec
);
787 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
789 /* Calculate temporary vectorial force */
790 tx
= _mm_mul_pd(fscal
,dx32
);
791 ty
= _mm_mul_pd(fscal
,dy32
);
792 tz
= _mm_mul_pd(fscal
,dz32
);
794 /* Update vectorial force */
795 fix3
= _mm_add_pd(fix3
,tx
);
796 fiy3
= _mm_add_pd(fiy3
,ty
);
797 fiz3
= _mm_add_pd(fiz3
,tz
);
799 fjx2
= _mm_add_pd(fjx2
,tx
);
800 fjy2
= _mm_add_pd(fjy2
,ty
);
801 fjz2
= _mm_add_pd(fjz2
,tz
);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
812 r33
= _mm_mul_pd(rsq33
,rinv33
);
814 /* EWALD ELECTROSTATICS */
816 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
817 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
818 ewitab
= _mm_cvttpd_epi32(ewrt
);
819 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
820 ewitab
= _mm_slli_epi32(ewitab
,2);
821 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
822 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
823 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
824 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
825 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
826 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
827 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
828 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
829 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_sub_pd(rinv33
,sh_ewald
),velec
));
830 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
832 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec
= _mm_and_pd(velec
,cutoff_mask
);
836 velecsum
= _mm_add_pd(velecsum
,velec
);
840 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
842 /* Calculate temporary vectorial force */
843 tx
= _mm_mul_pd(fscal
,dx33
);
844 ty
= _mm_mul_pd(fscal
,dy33
);
845 tz
= _mm_mul_pd(fscal
,dz33
);
847 /* Update vectorial force */
848 fix3
= _mm_add_pd(fix3
,tx
);
849 fiy3
= _mm_add_pd(fiy3
,ty
);
850 fiz3
= _mm_add_pd(fiz3
,tz
);
852 fjx3
= _mm_add_pd(fjx3
,tx
);
853 fjy3
= _mm_add_pd(fjy3
,ty
);
854 fjz3
= _mm_add_pd(fjz3
,tz
);
858 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
860 /* Inner loop uses 478 flops */
867 j_coord_offsetA
= DIM
*jnrA
;
869 /* load j atom coordinates */
870 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
871 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
872 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
874 /* Calculate displacement vector */
875 dx00
= _mm_sub_pd(ix0
,jx0
);
876 dy00
= _mm_sub_pd(iy0
,jy0
);
877 dz00
= _mm_sub_pd(iz0
,jz0
);
878 dx11
= _mm_sub_pd(ix1
,jx1
);
879 dy11
= _mm_sub_pd(iy1
,jy1
);
880 dz11
= _mm_sub_pd(iz1
,jz1
);
881 dx12
= _mm_sub_pd(ix1
,jx2
);
882 dy12
= _mm_sub_pd(iy1
,jy2
);
883 dz12
= _mm_sub_pd(iz1
,jz2
);
884 dx13
= _mm_sub_pd(ix1
,jx3
);
885 dy13
= _mm_sub_pd(iy1
,jy3
);
886 dz13
= _mm_sub_pd(iz1
,jz3
);
887 dx21
= _mm_sub_pd(ix2
,jx1
);
888 dy21
= _mm_sub_pd(iy2
,jy1
);
889 dz21
= _mm_sub_pd(iz2
,jz1
);
890 dx22
= _mm_sub_pd(ix2
,jx2
);
891 dy22
= _mm_sub_pd(iy2
,jy2
);
892 dz22
= _mm_sub_pd(iz2
,jz2
);
893 dx23
= _mm_sub_pd(ix2
,jx3
);
894 dy23
= _mm_sub_pd(iy2
,jy3
);
895 dz23
= _mm_sub_pd(iz2
,jz3
);
896 dx31
= _mm_sub_pd(ix3
,jx1
);
897 dy31
= _mm_sub_pd(iy3
,jy1
);
898 dz31
= _mm_sub_pd(iz3
,jz1
);
899 dx32
= _mm_sub_pd(ix3
,jx2
);
900 dy32
= _mm_sub_pd(iy3
,jy2
);
901 dz32
= _mm_sub_pd(iz3
,jz2
);
902 dx33
= _mm_sub_pd(ix3
,jx3
);
903 dy33
= _mm_sub_pd(iy3
,jy3
);
904 dz33
= _mm_sub_pd(iz3
,jz3
);
906 /* Calculate squared distance and things based on it */
907 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
908 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
909 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
910 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
911 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
912 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
913 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
914 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
915 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
916 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
918 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
919 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
920 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
921 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
922 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
923 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
924 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
925 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
926 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
927 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
929 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
930 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
931 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
932 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
933 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
934 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
935 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
936 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
937 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
938 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
940 fjx0
= _mm_setzero_pd();
941 fjy0
= _mm_setzero_pd();
942 fjz0
= _mm_setzero_pd();
943 fjx1
= _mm_setzero_pd();
944 fjy1
= _mm_setzero_pd();
945 fjz1
= _mm_setzero_pd();
946 fjx2
= _mm_setzero_pd();
947 fjy2
= _mm_setzero_pd();
948 fjz2
= _mm_setzero_pd();
949 fjx3
= _mm_setzero_pd();
950 fjy3
= _mm_setzero_pd();
951 fjz3
= _mm_setzero_pd();
953 /**************************
954 * CALCULATE INTERACTIONS *
955 **************************/
957 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
960 r00
= _mm_mul_pd(rsq00
,rinv00
);
962 /* Analytical LJ-PME */
963 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
964 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
965 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
966 exponent
= gmx_simd_exp_d(ewcljrsq
);
967 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
968 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
969 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
970 vvdw6
= _mm_mul_pd(_mm_sub_pd(c6_00
,_mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
))),rinvsix
);
971 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
972 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))),one_twelfth
),
973 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_add_pd(_mm_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
974 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
975 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,_mm_sub_pd(vvdw6
,_mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
977 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
981 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
982 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
986 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
988 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
990 /* Calculate temporary vectorial force */
991 tx
= _mm_mul_pd(fscal
,dx00
);
992 ty
= _mm_mul_pd(fscal
,dy00
);
993 tz
= _mm_mul_pd(fscal
,dz00
);
995 /* Update vectorial force */
996 fix0
= _mm_add_pd(fix0
,tx
);
997 fiy0
= _mm_add_pd(fiy0
,ty
);
998 fiz0
= _mm_add_pd(fiz0
,tz
);
1000 fjx0
= _mm_add_pd(fjx0
,tx
);
1001 fjy0
= _mm_add_pd(fjy0
,ty
);
1002 fjz0
= _mm_add_pd(fjz0
,tz
);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1013 r11
= _mm_mul_pd(rsq11
,rinv11
);
1015 /* EWALD ELECTROSTATICS */
1017 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1019 ewitab
= _mm_cvttpd_epi32(ewrt
);
1020 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1021 ewitab
= _mm_slli_epi32(ewitab
,2);
1022 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1023 ewtabD
= _mm_setzero_pd();
1024 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1025 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1026 ewtabFn
= _mm_setzero_pd();
1027 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1028 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1029 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1030 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
1031 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1033 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1035 /* Update potential sum for this i atom from the interaction with this j atom. */
1036 velec
= _mm_and_pd(velec
,cutoff_mask
);
1037 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1038 velecsum
= _mm_add_pd(velecsum
,velec
);
1042 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1044 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1046 /* Calculate temporary vectorial force */
1047 tx
= _mm_mul_pd(fscal
,dx11
);
1048 ty
= _mm_mul_pd(fscal
,dy11
);
1049 tz
= _mm_mul_pd(fscal
,dz11
);
1051 /* Update vectorial force */
1052 fix1
= _mm_add_pd(fix1
,tx
);
1053 fiy1
= _mm_add_pd(fiy1
,ty
);
1054 fiz1
= _mm_add_pd(fiz1
,tz
);
1056 fjx1
= _mm_add_pd(fjx1
,tx
);
1057 fjy1
= _mm_add_pd(fjy1
,ty
);
1058 fjz1
= _mm_add_pd(fjz1
,tz
);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1069 r12
= _mm_mul_pd(rsq12
,rinv12
);
1071 /* EWALD ELECTROSTATICS */
1073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1074 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1075 ewitab
= _mm_cvttpd_epi32(ewrt
);
1076 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1077 ewitab
= _mm_slli_epi32(ewitab
,2);
1078 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1079 ewtabD
= _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1081 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1082 ewtabFn
= _mm_setzero_pd();
1083 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1084 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1085 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1086 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
1087 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1089 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec
= _mm_and_pd(velec
,cutoff_mask
);
1093 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1094 velecsum
= _mm_add_pd(velecsum
,velec
);
1098 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1100 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1102 /* Calculate temporary vectorial force */
1103 tx
= _mm_mul_pd(fscal
,dx12
);
1104 ty
= _mm_mul_pd(fscal
,dy12
);
1105 tz
= _mm_mul_pd(fscal
,dz12
);
1107 /* Update vectorial force */
1108 fix1
= _mm_add_pd(fix1
,tx
);
1109 fiy1
= _mm_add_pd(fiy1
,ty
);
1110 fiz1
= _mm_add_pd(fiz1
,tz
);
1112 fjx2
= _mm_add_pd(fjx2
,tx
);
1113 fjy2
= _mm_add_pd(fjy2
,ty
);
1114 fjz2
= _mm_add_pd(fjz2
,tz
);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1125 r13
= _mm_mul_pd(rsq13
,rinv13
);
1127 /* EWALD ELECTROSTATICS */
1129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1130 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1131 ewitab
= _mm_cvttpd_epi32(ewrt
);
1132 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1133 ewitab
= _mm_slli_epi32(ewitab
,2);
1134 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1135 ewtabD
= _mm_setzero_pd();
1136 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1137 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1138 ewtabFn
= _mm_setzero_pd();
1139 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1140 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1141 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1142 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_sub_pd(rinv13
,sh_ewald
),velec
));
1143 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1145 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1147 /* Update potential sum for this i atom from the interaction with this j atom. */
1148 velec
= _mm_and_pd(velec
,cutoff_mask
);
1149 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1150 velecsum
= _mm_add_pd(velecsum
,velec
);
1154 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1156 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1158 /* Calculate temporary vectorial force */
1159 tx
= _mm_mul_pd(fscal
,dx13
);
1160 ty
= _mm_mul_pd(fscal
,dy13
);
1161 tz
= _mm_mul_pd(fscal
,dz13
);
1163 /* Update vectorial force */
1164 fix1
= _mm_add_pd(fix1
,tx
);
1165 fiy1
= _mm_add_pd(fiy1
,ty
);
1166 fiz1
= _mm_add_pd(fiz1
,tz
);
1168 fjx3
= _mm_add_pd(fjx3
,tx
);
1169 fjy3
= _mm_add_pd(fjy3
,ty
);
1170 fjz3
= _mm_add_pd(fjz3
,tz
);
1174 /**************************
1175 * CALCULATE INTERACTIONS *
1176 **************************/
1178 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1181 r21
= _mm_mul_pd(rsq21
,rinv21
);
1183 /* EWALD ELECTROSTATICS */
1185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1186 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1187 ewitab
= _mm_cvttpd_epi32(ewrt
);
1188 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1189 ewitab
= _mm_slli_epi32(ewitab
,2);
1190 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1191 ewtabD
= _mm_setzero_pd();
1192 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1193 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1194 ewtabFn
= _mm_setzero_pd();
1195 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1196 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1197 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1198 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
1199 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1201 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1203 /* Update potential sum for this i atom from the interaction with this j atom. */
1204 velec
= _mm_and_pd(velec
,cutoff_mask
);
1205 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1206 velecsum
= _mm_add_pd(velecsum
,velec
);
1210 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1212 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1214 /* Calculate temporary vectorial force */
1215 tx
= _mm_mul_pd(fscal
,dx21
);
1216 ty
= _mm_mul_pd(fscal
,dy21
);
1217 tz
= _mm_mul_pd(fscal
,dz21
);
1219 /* Update vectorial force */
1220 fix2
= _mm_add_pd(fix2
,tx
);
1221 fiy2
= _mm_add_pd(fiy2
,ty
);
1222 fiz2
= _mm_add_pd(fiz2
,tz
);
1224 fjx1
= _mm_add_pd(fjx1
,tx
);
1225 fjy1
= _mm_add_pd(fjy1
,ty
);
1226 fjz1
= _mm_add_pd(fjz1
,tz
);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1237 r22
= _mm_mul_pd(rsq22
,rinv22
);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1243 ewitab
= _mm_cvttpd_epi32(ewrt
);
1244 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1245 ewitab
= _mm_slli_epi32(ewitab
,2);
1246 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1247 ewtabD
= _mm_setzero_pd();
1248 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1249 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1250 ewtabFn
= _mm_setzero_pd();
1251 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1252 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1253 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1254 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
1255 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1257 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1259 /* Update potential sum for this i atom from the interaction with this j atom. */
1260 velec
= _mm_and_pd(velec
,cutoff_mask
);
1261 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1262 velecsum
= _mm_add_pd(velecsum
,velec
);
1266 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1268 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1270 /* Calculate temporary vectorial force */
1271 tx
= _mm_mul_pd(fscal
,dx22
);
1272 ty
= _mm_mul_pd(fscal
,dy22
);
1273 tz
= _mm_mul_pd(fscal
,dz22
);
1275 /* Update vectorial force */
1276 fix2
= _mm_add_pd(fix2
,tx
);
1277 fiy2
= _mm_add_pd(fiy2
,ty
);
1278 fiz2
= _mm_add_pd(fiz2
,tz
);
1280 fjx2
= _mm_add_pd(fjx2
,tx
);
1281 fjy2
= _mm_add_pd(fjy2
,ty
);
1282 fjz2
= _mm_add_pd(fjz2
,tz
);
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1293 r23
= _mm_mul_pd(rsq23
,rinv23
);
1295 /* EWALD ELECTROSTATICS */
1297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1299 ewitab
= _mm_cvttpd_epi32(ewrt
);
1300 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1301 ewitab
= _mm_slli_epi32(ewitab
,2);
1302 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1303 ewtabD
= _mm_setzero_pd();
1304 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1305 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1306 ewtabFn
= _mm_setzero_pd();
1307 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1308 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1309 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1310 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_sub_pd(rinv23
,sh_ewald
),velec
));
1311 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1313 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1315 /* Update potential sum for this i atom from the interaction with this j atom. */
1316 velec
= _mm_and_pd(velec
,cutoff_mask
);
1317 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1318 velecsum
= _mm_add_pd(velecsum
,velec
);
1322 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1324 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1326 /* Calculate temporary vectorial force */
1327 tx
= _mm_mul_pd(fscal
,dx23
);
1328 ty
= _mm_mul_pd(fscal
,dy23
);
1329 tz
= _mm_mul_pd(fscal
,dz23
);
1331 /* Update vectorial force */
1332 fix2
= _mm_add_pd(fix2
,tx
);
1333 fiy2
= _mm_add_pd(fiy2
,ty
);
1334 fiz2
= _mm_add_pd(fiz2
,tz
);
1336 fjx3
= _mm_add_pd(fjx3
,tx
);
1337 fjy3
= _mm_add_pd(fjy3
,ty
);
1338 fjz3
= _mm_add_pd(fjz3
,tz
);
1342 /**************************
1343 * CALCULATE INTERACTIONS *
1344 **************************/
1346 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1349 r31
= _mm_mul_pd(rsq31
,rinv31
);
1351 /* EWALD ELECTROSTATICS */
1353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1354 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1355 ewitab
= _mm_cvttpd_epi32(ewrt
);
1356 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1357 ewitab
= _mm_slli_epi32(ewitab
,2);
1358 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1359 ewtabD
= _mm_setzero_pd();
1360 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1361 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1362 ewtabFn
= _mm_setzero_pd();
1363 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1364 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1365 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1366 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_sub_pd(rinv31
,sh_ewald
),velec
));
1367 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1369 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1371 /* Update potential sum for this i atom from the interaction with this j atom. */
1372 velec
= _mm_and_pd(velec
,cutoff_mask
);
1373 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1374 velecsum
= _mm_add_pd(velecsum
,velec
);
1378 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1380 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1382 /* Calculate temporary vectorial force */
1383 tx
= _mm_mul_pd(fscal
,dx31
);
1384 ty
= _mm_mul_pd(fscal
,dy31
);
1385 tz
= _mm_mul_pd(fscal
,dz31
);
1387 /* Update vectorial force */
1388 fix3
= _mm_add_pd(fix3
,tx
);
1389 fiy3
= _mm_add_pd(fiy3
,ty
);
1390 fiz3
= _mm_add_pd(fiz3
,tz
);
1392 fjx1
= _mm_add_pd(fjx1
,tx
);
1393 fjy1
= _mm_add_pd(fjy1
,ty
);
1394 fjz1
= _mm_add_pd(fjz1
,tz
);
1398 /**************************
1399 * CALCULATE INTERACTIONS *
1400 **************************/
1402 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1405 r32
= _mm_mul_pd(rsq32
,rinv32
);
1407 /* EWALD ELECTROSTATICS */
1409 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1410 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1411 ewitab
= _mm_cvttpd_epi32(ewrt
);
1412 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1413 ewitab
= _mm_slli_epi32(ewitab
,2);
1414 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1415 ewtabD
= _mm_setzero_pd();
1416 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1417 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1418 ewtabFn
= _mm_setzero_pd();
1419 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1420 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1421 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1422 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_sub_pd(rinv32
,sh_ewald
),velec
));
1423 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1425 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1427 /* Update potential sum for this i atom from the interaction with this j atom. */
1428 velec
= _mm_and_pd(velec
,cutoff_mask
);
1429 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1430 velecsum
= _mm_add_pd(velecsum
,velec
);
1434 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1436 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1438 /* Calculate temporary vectorial force */
1439 tx
= _mm_mul_pd(fscal
,dx32
);
1440 ty
= _mm_mul_pd(fscal
,dy32
);
1441 tz
= _mm_mul_pd(fscal
,dz32
);
1443 /* Update vectorial force */
1444 fix3
= _mm_add_pd(fix3
,tx
);
1445 fiy3
= _mm_add_pd(fiy3
,ty
);
1446 fiz3
= _mm_add_pd(fiz3
,tz
);
1448 fjx2
= _mm_add_pd(fjx2
,tx
);
1449 fjy2
= _mm_add_pd(fjy2
,ty
);
1450 fjz2
= _mm_add_pd(fjz2
,tz
);
1454 /**************************
1455 * CALCULATE INTERACTIONS *
1456 **************************/
1458 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1461 r33
= _mm_mul_pd(rsq33
,rinv33
);
1463 /* EWALD ELECTROSTATICS */
1465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1466 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1467 ewitab
= _mm_cvttpd_epi32(ewrt
);
1468 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1469 ewitab
= _mm_slli_epi32(ewitab
,2);
1470 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1471 ewtabD
= _mm_setzero_pd();
1472 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1473 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1474 ewtabFn
= _mm_setzero_pd();
1475 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1476 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1477 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1478 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_sub_pd(rinv33
,sh_ewald
),velec
));
1479 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1481 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1483 /* Update potential sum for this i atom from the interaction with this j atom. */
1484 velec
= _mm_and_pd(velec
,cutoff_mask
);
1485 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1486 velecsum
= _mm_add_pd(velecsum
,velec
);
1490 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1492 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1494 /* Calculate temporary vectorial force */
1495 tx
= _mm_mul_pd(fscal
,dx33
);
1496 ty
= _mm_mul_pd(fscal
,dy33
);
1497 tz
= _mm_mul_pd(fscal
,dz33
);
1499 /* Update vectorial force */
1500 fix3
= _mm_add_pd(fix3
,tx
);
1501 fiy3
= _mm_add_pd(fiy3
,ty
);
1502 fiz3
= _mm_add_pd(fiz3
,tz
);
1504 fjx3
= _mm_add_pd(fjx3
,tx
);
1505 fjy3
= _mm_add_pd(fjy3
,ty
);
1506 fjz3
= _mm_add_pd(fjz3
,tz
);
1510 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1512 /* Inner loop uses 478 flops */
1515 /* End of innermost loop */
1517 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1518 f
+i_coord_offset
,fshift
+i_shift_offset
);
1521 /* Update potential energies */
1522 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1523 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1525 /* Increment number of inner iterations */
1526 inneriter
+= j_index_end
- j_index_start
;
1528 /* Outer loop uses 26 flops */
1531 /* Increment number of outer iterations */
1534 /* Update outer/inner flops */
1536 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*478);
1539 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1540 * Electrostatics interaction: Ewald
1541 * VdW interaction: LJEwald
1542 * Geometry: Water4-Water4
1543 * Calculate force/pot: Force
1546 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1547 (t_nblist
* gmx_restrict nlist
,
1548 rvec
* gmx_restrict xx
,
1549 rvec
* gmx_restrict ff
,
1550 t_forcerec
* gmx_restrict fr
,
1551 t_mdatoms
* gmx_restrict mdatoms
,
1552 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1553 t_nrnb
* gmx_restrict nrnb
)
1555 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1556 * just 0 for non-waters.
1557 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1558 * jnr indices corresponding to data put in the four positions in the SIMD register.
1560 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1561 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1563 int j_coord_offsetA
,j_coord_offsetB
;
1564 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1565 real rcutoff_scalar
;
1566 real
*shiftvec
,*fshift
,*x
,*f
;
1567 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1569 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1571 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1573 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1575 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1576 int vdwjidx0A
,vdwjidx0B
;
1577 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1578 int vdwjidx1A
,vdwjidx1B
;
1579 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1580 int vdwjidx2A
,vdwjidx2B
;
1581 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1582 int vdwjidx3A
,vdwjidx3B
;
1583 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1584 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1585 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1586 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1587 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1588 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1589 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1590 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1591 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1592 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1593 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1594 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1597 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1600 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1601 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1612 __m128d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
1614 __m128d one_half
= _mm_set1_pd(0.5);
1615 __m128d minus_one
= _mm_set1_pd(-1.0);
1617 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1619 __m128d dummy_mask
,cutoff_mask
;
1620 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1621 __m128d one
= _mm_set1_pd(1.0);
1622 __m128d two
= _mm_set1_pd(2.0);
1628 jindex
= nlist
->jindex
;
1630 shiftidx
= nlist
->shift
;
1632 shiftvec
= fr
->shift_vec
[0];
1633 fshift
= fr
->fshift
[0];
1634 facel
= _mm_set1_pd(fr
->epsfac
);
1635 charge
= mdatoms
->chargeA
;
1636 nvdwtype
= fr
->ntype
;
1637 vdwparam
= fr
->nbfp
;
1638 vdwtype
= mdatoms
->typeA
;
1639 vdwgridparam
= fr
->ljpme_c6grid
;
1640 sh_lj_ewald
= _mm_set1_pd(fr
->ic
->sh_lj_ewald
);
1641 ewclj
= _mm_set1_pd(fr
->ewaldcoeff_lj
);
1642 ewclj2
= _mm_mul_pd(minus_one
,_mm_mul_pd(ewclj
,ewclj
));
1644 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1645 ewtab
= fr
->ic
->tabq_coul_F
;
1646 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1647 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1649 /* Setup water-specific parameters */
1650 inr
= nlist
->iinr
[0];
1651 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1652 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1653 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1654 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1656 jq1
= _mm_set1_pd(charge
[inr
+1]);
1657 jq2
= _mm_set1_pd(charge
[inr
+2]);
1658 jq3
= _mm_set1_pd(charge
[inr
+3]);
1659 vdwjidx0A
= 2*vdwtype
[inr
+0];
1660 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1661 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1662 c6grid_00
= _mm_set1_pd(vdwgridparam
[vdwioffset0
+vdwjidx0A
]);
1663 qq11
= _mm_mul_pd(iq1
,jq1
);
1664 qq12
= _mm_mul_pd(iq1
,jq2
);
1665 qq13
= _mm_mul_pd(iq1
,jq3
);
1666 qq21
= _mm_mul_pd(iq2
,jq1
);
1667 qq22
= _mm_mul_pd(iq2
,jq2
);
1668 qq23
= _mm_mul_pd(iq2
,jq3
);
1669 qq31
= _mm_mul_pd(iq3
,jq1
);
1670 qq32
= _mm_mul_pd(iq3
,jq2
);
1671 qq33
= _mm_mul_pd(iq3
,jq3
);
1673 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1674 rcutoff_scalar
= fr
->rcoulomb
;
1675 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1676 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1678 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
1679 rvdw
= _mm_set1_pd(fr
->rvdw
);
1681 /* Avoid stupid compiler warnings */
1683 j_coord_offsetA
= 0;
1684 j_coord_offsetB
= 0;
1689 /* Start outer loop over neighborlists */
1690 for(iidx
=0; iidx
<nri
; iidx
++)
1692 /* Load shift vector for this list */
1693 i_shift_offset
= DIM
*shiftidx
[iidx
];
1695 /* Load limits for loop over neighbors */
1696 j_index_start
= jindex
[iidx
];
1697 j_index_end
= jindex
[iidx
+1];
1699 /* Get outer coordinate index */
1701 i_coord_offset
= DIM
*inr
;
1703 /* Load i particle coords and add shift vector */
1704 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1705 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1707 fix0
= _mm_setzero_pd();
1708 fiy0
= _mm_setzero_pd();
1709 fiz0
= _mm_setzero_pd();
1710 fix1
= _mm_setzero_pd();
1711 fiy1
= _mm_setzero_pd();
1712 fiz1
= _mm_setzero_pd();
1713 fix2
= _mm_setzero_pd();
1714 fiy2
= _mm_setzero_pd();
1715 fiz2
= _mm_setzero_pd();
1716 fix3
= _mm_setzero_pd();
1717 fiy3
= _mm_setzero_pd();
1718 fiz3
= _mm_setzero_pd();
1720 /* Start inner kernel loop */
1721 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1724 /* Get j neighbor index, and coordinate index */
1726 jnrB
= jjnr
[jidx
+1];
1727 j_coord_offsetA
= DIM
*jnrA
;
1728 j_coord_offsetB
= DIM
*jnrB
;
1730 /* load j atom coordinates */
1731 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1732 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1733 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1735 /* Calculate displacement vector */
1736 dx00
= _mm_sub_pd(ix0
,jx0
);
1737 dy00
= _mm_sub_pd(iy0
,jy0
);
1738 dz00
= _mm_sub_pd(iz0
,jz0
);
1739 dx11
= _mm_sub_pd(ix1
,jx1
);
1740 dy11
= _mm_sub_pd(iy1
,jy1
);
1741 dz11
= _mm_sub_pd(iz1
,jz1
);
1742 dx12
= _mm_sub_pd(ix1
,jx2
);
1743 dy12
= _mm_sub_pd(iy1
,jy2
);
1744 dz12
= _mm_sub_pd(iz1
,jz2
);
1745 dx13
= _mm_sub_pd(ix1
,jx3
);
1746 dy13
= _mm_sub_pd(iy1
,jy3
);
1747 dz13
= _mm_sub_pd(iz1
,jz3
);
1748 dx21
= _mm_sub_pd(ix2
,jx1
);
1749 dy21
= _mm_sub_pd(iy2
,jy1
);
1750 dz21
= _mm_sub_pd(iz2
,jz1
);
1751 dx22
= _mm_sub_pd(ix2
,jx2
);
1752 dy22
= _mm_sub_pd(iy2
,jy2
);
1753 dz22
= _mm_sub_pd(iz2
,jz2
);
1754 dx23
= _mm_sub_pd(ix2
,jx3
);
1755 dy23
= _mm_sub_pd(iy2
,jy3
);
1756 dz23
= _mm_sub_pd(iz2
,jz3
);
1757 dx31
= _mm_sub_pd(ix3
,jx1
);
1758 dy31
= _mm_sub_pd(iy3
,jy1
);
1759 dz31
= _mm_sub_pd(iz3
,jz1
);
1760 dx32
= _mm_sub_pd(ix3
,jx2
);
1761 dy32
= _mm_sub_pd(iy3
,jy2
);
1762 dz32
= _mm_sub_pd(iz3
,jz2
);
1763 dx33
= _mm_sub_pd(ix3
,jx3
);
1764 dy33
= _mm_sub_pd(iy3
,jy3
);
1765 dz33
= _mm_sub_pd(iz3
,jz3
);
1767 /* Calculate squared distance and things based on it */
1768 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1769 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1770 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1771 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1772 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1773 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1774 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1775 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1776 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1777 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1779 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1780 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1781 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1782 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1783 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1784 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1785 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1786 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1787 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1788 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1790 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1791 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1792 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1793 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1794 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1795 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1796 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1797 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1798 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1799 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1801 fjx0
= _mm_setzero_pd();
1802 fjy0
= _mm_setzero_pd();
1803 fjz0
= _mm_setzero_pd();
1804 fjx1
= _mm_setzero_pd();
1805 fjy1
= _mm_setzero_pd();
1806 fjz1
= _mm_setzero_pd();
1807 fjx2
= _mm_setzero_pd();
1808 fjy2
= _mm_setzero_pd();
1809 fjz2
= _mm_setzero_pd();
1810 fjx3
= _mm_setzero_pd();
1811 fjy3
= _mm_setzero_pd();
1812 fjz3
= _mm_setzero_pd();
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1821 r00
= _mm_mul_pd(rsq00
,rinv00
);
1823 /* Analytical LJ-PME */
1824 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1825 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
1826 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
1827 exponent
= gmx_simd_exp_d(ewcljrsq
);
1828 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1829 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1830 /* f6A = 6 * C6grid * (1 - poly) */
1831 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
1832 /* f6B = C6grid * exponent * beta^6 */
1833 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
1834 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1835 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1837 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1841 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1843 /* Calculate temporary vectorial force */
1844 tx
= _mm_mul_pd(fscal
,dx00
);
1845 ty
= _mm_mul_pd(fscal
,dy00
);
1846 tz
= _mm_mul_pd(fscal
,dz00
);
1848 /* Update vectorial force */
1849 fix0
= _mm_add_pd(fix0
,tx
);
1850 fiy0
= _mm_add_pd(fiy0
,ty
);
1851 fiz0
= _mm_add_pd(fiz0
,tz
);
1853 fjx0
= _mm_add_pd(fjx0
,tx
);
1854 fjy0
= _mm_add_pd(fjy0
,ty
);
1855 fjz0
= _mm_add_pd(fjz0
,tz
);
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1866 r11
= _mm_mul_pd(rsq11
,rinv11
);
1868 /* EWALD ELECTROSTATICS */
1870 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1871 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1872 ewitab
= _mm_cvttpd_epi32(ewrt
);
1873 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1874 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1876 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1877 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1879 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1883 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1885 /* Calculate temporary vectorial force */
1886 tx
= _mm_mul_pd(fscal
,dx11
);
1887 ty
= _mm_mul_pd(fscal
,dy11
);
1888 tz
= _mm_mul_pd(fscal
,dz11
);
1890 /* Update vectorial force */
1891 fix1
= _mm_add_pd(fix1
,tx
);
1892 fiy1
= _mm_add_pd(fiy1
,ty
);
1893 fiz1
= _mm_add_pd(fiz1
,tz
);
1895 fjx1
= _mm_add_pd(fjx1
,tx
);
1896 fjy1
= _mm_add_pd(fjy1
,ty
);
1897 fjz1
= _mm_add_pd(fjz1
,tz
);
1901 /**************************
1902 * CALCULATE INTERACTIONS *
1903 **************************/
1905 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1908 r12
= _mm_mul_pd(rsq12
,rinv12
);
1910 /* EWALD ELECTROSTATICS */
1912 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1914 ewitab
= _mm_cvttpd_epi32(ewrt
);
1915 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1916 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1918 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1919 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1921 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1925 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1927 /* Calculate temporary vectorial force */
1928 tx
= _mm_mul_pd(fscal
,dx12
);
1929 ty
= _mm_mul_pd(fscal
,dy12
);
1930 tz
= _mm_mul_pd(fscal
,dz12
);
1932 /* Update vectorial force */
1933 fix1
= _mm_add_pd(fix1
,tx
);
1934 fiy1
= _mm_add_pd(fiy1
,ty
);
1935 fiz1
= _mm_add_pd(fiz1
,tz
);
1937 fjx2
= _mm_add_pd(fjx2
,tx
);
1938 fjy2
= _mm_add_pd(fjy2
,ty
);
1939 fjz2
= _mm_add_pd(fjz2
,tz
);
1943 /**************************
1944 * CALCULATE INTERACTIONS *
1945 **************************/
1947 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1950 r13
= _mm_mul_pd(rsq13
,rinv13
);
1952 /* EWALD ELECTROSTATICS */
1954 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1955 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1956 ewitab
= _mm_cvttpd_epi32(ewrt
);
1957 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1958 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1960 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1961 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1963 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1967 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1969 /* Calculate temporary vectorial force */
1970 tx
= _mm_mul_pd(fscal
,dx13
);
1971 ty
= _mm_mul_pd(fscal
,dy13
);
1972 tz
= _mm_mul_pd(fscal
,dz13
);
1974 /* Update vectorial force */
1975 fix1
= _mm_add_pd(fix1
,tx
);
1976 fiy1
= _mm_add_pd(fiy1
,ty
);
1977 fiz1
= _mm_add_pd(fiz1
,tz
);
1979 fjx3
= _mm_add_pd(fjx3
,tx
);
1980 fjy3
= _mm_add_pd(fjy3
,ty
);
1981 fjz3
= _mm_add_pd(fjz3
,tz
);
1985 /**************************
1986 * CALCULATE INTERACTIONS *
1987 **************************/
1989 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1992 r21
= _mm_mul_pd(rsq21
,rinv21
);
1994 /* EWALD ELECTROSTATICS */
1996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1997 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1998 ewitab
= _mm_cvttpd_epi32(ewrt
);
1999 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2000 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2002 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2003 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2005 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2009 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2011 /* Calculate temporary vectorial force */
2012 tx
= _mm_mul_pd(fscal
,dx21
);
2013 ty
= _mm_mul_pd(fscal
,dy21
);
2014 tz
= _mm_mul_pd(fscal
,dz21
);
2016 /* Update vectorial force */
2017 fix2
= _mm_add_pd(fix2
,tx
);
2018 fiy2
= _mm_add_pd(fiy2
,ty
);
2019 fiz2
= _mm_add_pd(fiz2
,tz
);
2021 fjx1
= _mm_add_pd(fjx1
,tx
);
2022 fjy1
= _mm_add_pd(fjy1
,ty
);
2023 fjz1
= _mm_add_pd(fjz1
,tz
);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2034 r22
= _mm_mul_pd(rsq22
,rinv22
);
2036 /* EWALD ELECTROSTATICS */
2038 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2039 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2040 ewitab
= _mm_cvttpd_epi32(ewrt
);
2041 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2042 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2044 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2045 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2047 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2051 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2053 /* Calculate temporary vectorial force */
2054 tx
= _mm_mul_pd(fscal
,dx22
);
2055 ty
= _mm_mul_pd(fscal
,dy22
);
2056 tz
= _mm_mul_pd(fscal
,dz22
);
2058 /* Update vectorial force */
2059 fix2
= _mm_add_pd(fix2
,tx
);
2060 fiy2
= _mm_add_pd(fiy2
,ty
);
2061 fiz2
= _mm_add_pd(fiz2
,tz
);
2063 fjx2
= _mm_add_pd(fjx2
,tx
);
2064 fjy2
= _mm_add_pd(fjy2
,ty
);
2065 fjz2
= _mm_add_pd(fjz2
,tz
);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2076 r23
= _mm_mul_pd(rsq23
,rinv23
);
2078 /* EWALD ELECTROSTATICS */
2080 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2082 ewitab
= _mm_cvttpd_epi32(ewrt
);
2083 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2084 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2086 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2087 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2089 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2093 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2095 /* Calculate temporary vectorial force */
2096 tx
= _mm_mul_pd(fscal
,dx23
);
2097 ty
= _mm_mul_pd(fscal
,dy23
);
2098 tz
= _mm_mul_pd(fscal
,dz23
);
2100 /* Update vectorial force */
2101 fix2
= _mm_add_pd(fix2
,tx
);
2102 fiy2
= _mm_add_pd(fiy2
,ty
);
2103 fiz2
= _mm_add_pd(fiz2
,tz
);
2105 fjx3
= _mm_add_pd(fjx3
,tx
);
2106 fjy3
= _mm_add_pd(fjy3
,ty
);
2107 fjz3
= _mm_add_pd(fjz3
,tz
);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2118 r31
= _mm_mul_pd(rsq31
,rinv31
);
2120 /* EWALD ELECTROSTATICS */
2122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2123 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2124 ewitab
= _mm_cvttpd_epi32(ewrt
);
2125 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2126 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2128 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2129 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2131 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2135 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2137 /* Calculate temporary vectorial force */
2138 tx
= _mm_mul_pd(fscal
,dx31
);
2139 ty
= _mm_mul_pd(fscal
,dy31
);
2140 tz
= _mm_mul_pd(fscal
,dz31
);
2142 /* Update vectorial force */
2143 fix3
= _mm_add_pd(fix3
,tx
);
2144 fiy3
= _mm_add_pd(fiy3
,ty
);
2145 fiz3
= _mm_add_pd(fiz3
,tz
);
2147 fjx1
= _mm_add_pd(fjx1
,tx
);
2148 fjy1
= _mm_add_pd(fjy1
,ty
);
2149 fjz1
= _mm_add_pd(fjz1
,tz
);
2153 /**************************
2154 * CALCULATE INTERACTIONS *
2155 **************************/
2157 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2160 r32
= _mm_mul_pd(rsq32
,rinv32
);
2162 /* EWALD ELECTROSTATICS */
2164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2165 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2166 ewitab
= _mm_cvttpd_epi32(ewrt
);
2167 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2168 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2170 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2171 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2173 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2177 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2179 /* Calculate temporary vectorial force */
2180 tx
= _mm_mul_pd(fscal
,dx32
);
2181 ty
= _mm_mul_pd(fscal
,dy32
);
2182 tz
= _mm_mul_pd(fscal
,dz32
);
2184 /* Update vectorial force */
2185 fix3
= _mm_add_pd(fix3
,tx
);
2186 fiy3
= _mm_add_pd(fiy3
,ty
);
2187 fiz3
= _mm_add_pd(fiz3
,tz
);
2189 fjx2
= _mm_add_pd(fjx2
,tx
);
2190 fjy2
= _mm_add_pd(fjy2
,ty
);
2191 fjz2
= _mm_add_pd(fjz2
,tz
);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2202 r33
= _mm_mul_pd(rsq33
,rinv33
);
2204 /* EWALD ELECTROSTATICS */
2206 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2207 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2208 ewitab
= _mm_cvttpd_epi32(ewrt
);
2209 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2210 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2212 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2213 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2215 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2219 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2221 /* Calculate temporary vectorial force */
2222 tx
= _mm_mul_pd(fscal
,dx33
);
2223 ty
= _mm_mul_pd(fscal
,dy33
);
2224 tz
= _mm_mul_pd(fscal
,dz33
);
2226 /* Update vectorial force */
2227 fix3
= _mm_add_pd(fix3
,tx
);
2228 fiy3
= _mm_add_pd(fiy3
,ty
);
2229 fiz3
= _mm_add_pd(fiz3
,tz
);
2231 fjx3
= _mm_add_pd(fjx3
,tx
);
2232 fjy3
= _mm_add_pd(fjy3
,ty
);
2233 fjz3
= _mm_add_pd(fjz3
,tz
);
2237 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2239 /* Inner loop uses 403 flops */
2242 if(jidx
<j_index_end
)
2246 j_coord_offsetA
= DIM
*jnrA
;
2248 /* load j atom coordinates */
2249 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2250 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2251 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2253 /* Calculate displacement vector */
2254 dx00
= _mm_sub_pd(ix0
,jx0
);
2255 dy00
= _mm_sub_pd(iy0
,jy0
);
2256 dz00
= _mm_sub_pd(iz0
,jz0
);
2257 dx11
= _mm_sub_pd(ix1
,jx1
);
2258 dy11
= _mm_sub_pd(iy1
,jy1
);
2259 dz11
= _mm_sub_pd(iz1
,jz1
);
2260 dx12
= _mm_sub_pd(ix1
,jx2
);
2261 dy12
= _mm_sub_pd(iy1
,jy2
);
2262 dz12
= _mm_sub_pd(iz1
,jz2
);
2263 dx13
= _mm_sub_pd(ix1
,jx3
);
2264 dy13
= _mm_sub_pd(iy1
,jy3
);
2265 dz13
= _mm_sub_pd(iz1
,jz3
);
2266 dx21
= _mm_sub_pd(ix2
,jx1
);
2267 dy21
= _mm_sub_pd(iy2
,jy1
);
2268 dz21
= _mm_sub_pd(iz2
,jz1
);
2269 dx22
= _mm_sub_pd(ix2
,jx2
);
2270 dy22
= _mm_sub_pd(iy2
,jy2
);
2271 dz22
= _mm_sub_pd(iz2
,jz2
);
2272 dx23
= _mm_sub_pd(ix2
,jx3
);
2273 dy23
= _mm_sub_pd(iy2
,jy3
);
2274 dz23
= _mm_sub_pd(iz2
,jz3
);
2275 dx31
= _mm_sub_pd(ix3
,jx1
);
2276 dy31
= _mm_sub_pd(iy3
,jy1
);
2277 dz31
= _mm_sub_pd(iz3
,jz1
);
2278 dx32
= _mm_sub_pd(ix3
,jx2
);
2279 dy32
= _mm_sub_pd(iy3
,jy2
);
2280 dz32
= _mm_sub_pd(iz3
,jz2
);
2281 dx33
= _mm_sub_pd(ix3
,jx3
);
2282 dy33
= _mm_sub_pd(iy3
,jy3
);
2283 dz33
= _mm_sub_pd(iz3
,jz3
);
2285 /* Calculate squared distance and things based on it */
2286 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2287 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2288 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2289 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
2290 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2291 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2292 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
2293 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
2294 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
2295 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
2297 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
2298 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
2299 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
2300 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
2301 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
2302 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
2303 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
2304 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
2305 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
2306 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
2308 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
2309 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2310 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2311 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
2312 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2313 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2314 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
2315 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
2316 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
2317 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
2319 fjx0
= _mm_setzero_pd();
2320 fjy0
= _mm_setzero_pd();
2321 fjz0
= _mm_setzero_pd();
2322 fjx1
= _mm_setzero_pd();
2323 fjy1
= _mm_setzero_pd();
2324 fjz1
= _mm_setzero_pd();
2325 fjx2
= _mm_setzero_pd();
2326 fjy2
= _mm_setzero_pd();
2327 fjz2
= _mm_setzero_pd();
2328 fjx3
= _mm_setzero_pd();
2329 fjy3
= _mm_setzero_pd();
2330 fjz3
= _mm_setzero_pd();
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2339 r00
= _mm_mul_pd(rsq00
,rinv00
);
2341 /* Analytical LJ-PME */
2342 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2343 ewcljrsq
= _mm_mul_pd(ewclj2
,rsq00
);
2344 ewclj6
= _mm_mul_pd(ewclj2
,_mm_mul_pd(ewclj2
,ewclj2
));
2345 exponent
= gmx_simd_exp_d(ewcljrsq
);
2346 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2347 poly
= _mm_mul_pd(exponent
,_mm_add_pd(_mm_sub_pd(one
,ewcljrsq
),_mm_mul_pd(_mm_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
2348 /* f6A = 6 * C6grid * (1 - poly) */
2349 f6A
= _mm_mul_pd(c6grid_00
,_mm_sub_pd(one
,poly
));
2350 /* f6B = C6grid * exponent * beta^6 */
2351 f6B
= _mm_mul_pd(_mm_mul_pd(c6grid_00
,one_sixth
),_mm_mul_pd(exponent
,ewclj6
));
2352 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2353 fvdw
= _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),_mm_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
2355 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2359 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2361 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2363 /* Calculate temporary vectorial force */
2364 tx
= _mm_mul_pd(fscal
,dx00
);
2365 ty
= _mm_mul_pd(fscal
,dy00
);
2366 tz
= _mm_mul_pd(fscal
,dz00
);
2368 /* Update vectorial force */
2369 fix0
= _mm_add_pd(fix0
,tx
);
2370 fiy0
= _mm_add_pd(fiy0
,ty
);
2371 fiz0
= _mm_add_pd(fiz0
,tz
);
2373 fjx0
= _mm_add_pd(fjx0
,tx
);
2374 fjy0
= _mm_add_pd(fjy0
,ty
);
2375 fjz0
= _mm_add_pd(fjz0
,tz
);
2379 /**************************
2380 * CALCULATE INTERACTIONS *
2381 **************************/
2383 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2386 r11
= _mm_mul_pd(rsq11
,rinv11
);
2388 /* EWALD ELECTROSTATICS */
2390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2391 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2392 ewitab
= _mm_cvttpd_epi32(ewrt
);
2393 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2394 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2395 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2396 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2398 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2402 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2404 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2406 /* Calculate temporary vectorial force */
2407 tx
= _mm_mul_pd(fscal
,dx11
);
2408 ty
= _mm_mul_pd(fscal
,dy11
);
2409 tz
= _mm_mul_pd(fscal
,dz11
);
2411 /* Update vectorial force */
2412 fix1
= _mm_add_pd(fix1
,tx
);
2413 fiy1
= _mm_add_pd(fiy1
,ty
);
2414 fiz1
= _mm_add_pd(fiz1
,tz
);
2416 fjx1
= _mm_add_pd(fjx1
,tx
);
2417 fjy1
= _mm_add_pd(fjy1
,ty
);
2418 fjz1
= _mm_add_pd(fjz1
,tz
);
2422 /**************************
2423 * CALCULATE INTERACTIONS *
2424 **************************/
2426 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2429 r12
= _mm_mul_pd(rsq12
,rinv12
);
2431 /* EWALD ELECTROSTATICS */
2433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2434 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2435 ewitab
= _mm_cvttpd_epi32(ewrt
);
2436 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2437 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2438 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2439 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2441 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2445 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2447 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2449 /* Calculate temporary vectorial force */
2450 tx
= _mm_mul_pd(fscal
,dx12
);
2451 ty
= _mm_mul_pd(fscal
,dy12
);
2452 tz
= _mm_mul_pd(fscal
,dz12
);
2454 /* Update vectorial force */
2455 fix1
= _mm_add_pd(fix1
,tx
);
2456 fiy1
= _mm_add_pd(fiy1
,ty
);
2457 fiz1
= _mm_add_pd(fiz1
,tz
);
2459 fjx2
= _mm_add_pd(fjx2
,tx
);
2460 fjy2
= _mm_add_pd(fjy2
,ty
);
2461 fjz2
= _mm_add_pd(fjz2
,tz
);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2472 r13
= _mm_mul_pd(rsq13
,rinv13
);
2474 /* EWALD ELECTROSTATICS */
2476 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2478 ewitab
= _mm_cvttpd_epi32(ewrt
);
2479 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2480 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2481 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2482 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2484 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2488 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2490 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2492 /* Calculate temporary vectorial force */
2493 tx
= _mm_mul_pd(fscal
,dx13
);
2494 ty
= _mm_mul_pd(fscal
,dy13
);
2495 tz
= _mm_mul_pd(fscal
,dz13
);
2497 /* Update vectorial force */
2498 fix1
= _mm_add_pd(fix1
,tx
);
2499 fiy1
= _mm_add_pd(fiy1
,ty
);
2500 fiz1
= _mm_add_pd(fiz1
,tz
);
2502 fjx3
= _mm_add_pd(fjx3
,tx
);
2503 fjy3
= _mm_add_pd(fjy3
,ty
);
2504 fjz3
= _mm_add_pd(fjz3
,tz
);
2508 /**************************
2509 * CALCULATE INTERACTIONS *
2510 **************************/
2512 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2515 r21
= _mm_mul_pd(rsq21
,rinv21
);
2517 /* EWALD ELECTROSTATICS */
2519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2520 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2521 ewitab
= _mm_cvttpd_epi32(ewrt
);
2522 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2523 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2524 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2525 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2527 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2531 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2533 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2535 /* Calculate temporary vectorial force */
2536 tx
= _mm_mul_pd(fscal
,dx21
);
2537 ty
= _mm_mul_pd(fscal
,dy21
);
2538 tz
= _mm_mul_pd(fscal
,dz21
);
2540 /* Update vectorial force */
2541 fix2
= _mm_add_pd(fix2
,tx
);
2542 fiy2
= _mm_add_pd(fiy2
,ty
);
2543 fiz2
= _mm_add_pd(fiz2
,tz
);
2545 fjx1
= _mm_add_pd(fjx1
,tx
);
2546 fjy1
= _mm_add_pd(fjy1
,ty
);
2547 fjz1
= _mm_add_pd(fjz1
,tz
);
2551 /**************************
2552 * CALCULATE INTERACTIONS *
2553 **************************/
2555 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2558 r22
= _mm_mul_pd(rsq22
,rinv22
);
2560 /* EWALD ELECTROSTATICS */
2562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2563 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2564 ewitab
= _mm_cvttpd_epi32(ewrt
);
2565 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2566 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2567 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2568 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2570 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2574 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2576 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2578 /* Calculate temporary vectorial force */
2579 tx
= _mm_mul_pd(fscal
,dx22
);
2580 ty
= _mm_mul_pd(fscal
,dy22
);
2581 tz
= _mm_mul_pd(fscal
,dz22
);
2583 /* Update vectorial force */
2584 fix2
= _mm_add_pd(fix2
,tx
);
2585 fiy2
= _mm_add_pd(fiy2
,ty
);
2586 fiz2
= _mm_add_pd(fiz2
,tz
);
2588 fjx2
= _mm_add_pd(fjx2
,tx
);
2589 fjy2
= _mm_add_pd(fjy2
,ty
);
2590 fjz2
= _mm_add_pd(fjz2
,tz
);
2594 /**************************
2595 * CALCULATE INTERACTIONS *
2596 **************************/
2598 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2601 r23
= _mm_mul_pd(rsq23
,rinv23
);
2603 /* EWALD ELECTROSTATICS */
2605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2606 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2607 ewitab
= _mm_cvttpd_epi32(ewrt
);
2608 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2609 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2610 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2611 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2613 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2617 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2619 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2621 /* Calculate temporary vectorial force */
2622 tx
= _mm_mul_pd(fscal
,dx23
);
2623 ty
= _mm_mul_pd(fscal
,dy23
);
2624 tz
= _mm_mul_pd(fscal
,dz23
);
2626 /* Update vectorial force */
2627 fix2
= _mm_add_pd(fix2
,tx
);
2628 fiy2
= _mm_add_pd(fiy2
,ty
);
2629 fiz2
= _mm_add_pd(fiz2
,tz
);
2631 fjx3
= _mm_add_pd(fjx3
,tx
);
2632 fjy3
= _mm_add_pd(fjy3
,ty
);
2633 fjz3
= _mm_add_pd(fjz3
,tz
);
2637 /**************************
2638 * CALCULATE INTERACTIONS *
2639 **************************/
2641 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2644 r31
= _mm_mul_pd(rsq31
,rinv31
);
2646 /* EWALD ELECTROSTATICS */
2648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2649 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2650 ewitab
= _mm_cvttpd_epi32(ewrt
);
2651 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2652 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2653 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2654 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2656 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2660 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2662 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2664 /* Calculate temporary vectorial force */
2665 tx
= _mm_mul_pd(fscal
,dx31
);
2666 ty
= _mm_mul_pd(fscal
,dy31
);
2667 tz
= _mm_mul_pd(fscal
,dz31
);
2669 /* Update vectorial force */
2670 fix3
= _mm_add_pd(fix3
,tx
);
2671 fiy3
= _mm_add_pd(fiy3
,ty
);
2672 fiz3
= _mm_add_pd(fiz3
,tz
);
2674 fjx1
= _mm_add_pd(fjx1
,tx
);
2675 fjy1
= _mm_add_pd(fjy1
,ty
);
2676 fjz1
= _mm_add_pd(fjz1
,tz
);
2680 /**************************
2681 * CALCULATE INTERACTIONS *
2682 **************************/
2684 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2687 r32
= _mm_mul_pd(rsq32
,rinv32
);
2689 /* EWALD ELECTROSTATICS */
2691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2692 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2693 ewitab
= _mm_cvttpd_epi32(ewrt
);
2694 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2695 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2696 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2697 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2699 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2703 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2705 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2707 /* Calculate temporary vectorial force */
2708 tx
= _mm_mul_pd(fscal
,dx32
);
2709 ty
= _mm_mul_pd(fscal
,dy32
);
2710 tz
= _mm_mul_pd(fscal
,dz32
);
2712 /* Update vectorial force */
2713 fix3
= _mm_add_pd(fix3
,tx
);
2714 fiy3
= _mm_add_pd(fiy3
,ty
);
2715 fiz3
= _mm_add_pd(fiz3
,tz
);
2717 fjx2
= _mm_add_pd(fjx2
,tx
);
2718 fjy2
= _mm_add_pd(fjy2
,ty
);
2719 fjz2
= _mm_add_pd(fjz2
,tz
);
2723 /**************************
2724 * CALCULATE INTERACTIONS *
2725 **************************/
2727 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2730 r33
= _mm_mul_pd(rsq33
,rinv33
);
2732 /* EWALD ELECTROSTATICS */
2734 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2735 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2736 ewitab
= _mm_cvttpd_epi32(ewrt
);
2737 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2738 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2739 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2740 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2742 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2746 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2748 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2750 /* Calculate temporary vectorial force */
2751 tx
= _mm_mul_pd(fscal
,dx33
);
2752 ty
= _mm_mul_pd(fscal
,dy33
);
2753 tz
= _mm_mul_pd(fscal
,dz33
);
2755 /* Update vectorial force */
2756 fix3
= _mm_add_pd(fix3
,tx
);
2757 fiy3
= _mm_add_pd(fiy3
,ty
);
2758 fiz3
= _mm_add_pd(fiz3
,tz
);
2760 fjx3
= _mm_add_pd(fjx3
,tx
);
2761 fjy3
= _mm_add_pd(fjy3
,ty
);
2762 fjz3
= _mm_add_pd(fjz3
,tz
);
2766 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2768 /* Inner loop uses 403 flops */
2771 /* End of innermost loop */
2773 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2774 f
+i_coord_offset
,fshift
+i_shift_offset
);
2776 /* Increment number of inner iterations */
2777 inneriter
+= j_index_end
- j_index_start
;
2779 /* Outer loop uses 24 flops */
2782 /* Increment number of outer iterations */
2785 /* Update outer/inner flops */
2787 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*403);