2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
98 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
99 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
100 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
103 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
111 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
122 __m128 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
124 __m128 one_half
= _mm_set1_ps(0.5);
125 __m128 minus_one
= _mm_set1_ps(-1.0);
127 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
129 __m128 dummy_mask
,cutoff_mask
;
130 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
131 __m128 one
= _mm_set1_ps(1.0);
132 __m128 two
= _mm_set1_ps(2.0);
138 jindex
= nlist
->jindex
;
140 shiftidx
= nlist
->shift
;
142 shiftvec
= fr
->shift_vec
[0];
143 fshift
= fr
->fshift
[0];
144 facel
= _mm_set1_ps(fr
->epsfac
);
145 charge
= mdatoms
->chargeA
;
146 nvdwtype
= fr
->ntype
;
148 vdwtype
= mdatoms
->typeA
;
149 vdwgridparam
= fr
->ljpme_c6grid
;
150 sh_lj_ewald
= _mm_set1_ps(fr
->ic
->sh_lj_ewald
);
151 ewclj
= _mm_set1_ps(fr
->ewaldcoeff_lj
);
152 ewclj2
= _mm_mul_ps(minus_one
,_mm_mul_ps(ewclj
,ewclj
));
154 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
155 ewtab
= fr
->ic
->tabq_coul_FDV0
;
156 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
157 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
159 /* Setup water-specific parameters */
160 inr
= nlist
->iinr
[0];
161 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
162 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
163 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
164 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
166 jq0
= _mm_set1_ps(charge
[inr
+0]);
167 jq1
= _mm_set1_ps(charge
[inr
+1]);
168 jq2
= _mm_set1_ps(charge
[inr
+2]);
169 vdwjidx0A
= 2*vdwtype
[inr
+0];
170 qq00
= _mm_mul_ps(iq0
,jq0
);
171 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
172 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
173 c6grid_00
= _mm_set1_ps(vdwgridparam
[vdwioffset0
+vdwjidx0A
]);
174 qq01
= _mm_mul_ps(iq0
,jq1
);
175 qq02
= _mm_mul_ps(iq0
,jq2
);
176 qq10
= _mm_mul_ps(iq1
,jq0
);
177 qq11
= _mm_mul_ps(iq1
,jq1
);
178 qq12
= _mm_mul_ps(iq1
,jq2
);
179 qq20
= _mm_mul_ps(iq2
,jq0
);
180 qq21
= _mm_mul_ps(iq2
,jq1
);
181 qq22
= _mm_mul_ps(iq2
,jq2
);
183 /* Avoid stupid compiler warnings */
184 jnrA
= jnrB
= jnrC
= jnrD
= 0;
193 for(iidx
=0;iidx
<4*DIM
;iidx
++)
198 /* Start outer loop over neighborlists */
199 for(iidx
=0; iidx
<nri
; iidx
++)
201 /* Load shift vector for this list */
202 i_shift_offset
= DIM
*shiftidx
[iidx
];
204 /* Load limits for loop over neighbors */
205 j_index_start
= jindex
[iidx
];
206 j_index_end
= jindex
[iidx
+1];
208 /* Get outer coordinate index */
210 i_coord_offset
= DIM
*inr
;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
214 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
216 fix0
= _mm_setzero_ps();
217 fiy0
= _mm_setzero_ps();
218 fiz0
= _mm_setzero_ps();
219 fix1
= _mm_setzero_ps();
220 fiy1
= _mm_setzero_ps();
221 fiz1
= _mm_setzero_ps();
222 fix2
= _mm_setzero_ps();
223 fiy2
= _mm_setzero_ps();
224 fiz2
= _mm_setzero_ps();
226 /* Reset potential sums */
227 velecsum
= _mm_setzero_ps();
228 vvdwsum
= _mm_setzero_ps();
230 /* Start inner kernel loop */
231 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
234 /* Get j neighbor index, and coordinate index */
239 j_coord_offsetA
= DIM
*jnrA
;
240 j_coord_offsetB
= DIM
*jnrB
;
241 j_coord_offsetC
= DIM
*jnrC
;
242 j_coord_offsetD
= DIM
*jnrD
;
244 /* load j atom coordinates */
245 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
246 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
247 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
249 /* Calculate displacement vector */
250 dx00
= _mm_sub_ps(ix0
,jx0
);
251 dy00
= _mm_sub_ps(iy0
,jy0
);
252 dz00
= _mm_sub_ps(iz0
,jz0
);
253 dx01
= _mm_sub_ps(ix0
,jx1
);
254 dy01
= _mm_sub_ps(iy0
,jy1
);
255 dz01
= _mm_sub_ps(iz0
,jz1
);
256 dx02
= _mm_sub_ps(ix0
,jx2
);
257 dy02
= _mm_sub_ps(iy0
,jy2
);
258 dz02
= _mm_sub_ps(iz0
,jz2
);
259 dx10
= _mm_sub_ps(ix1
,jx0
);
260 dy10
= _mm_sub_ps(iy1
,jy0
);
261 dz10
= _mm_sub_ps(iz1
,jz0
);
262 dx11
= _mm_sub_ps(ix1
,jx1
);
263 dy11
= _mm_sub_ps(iy1
,jy1
);
264 dz11
= _mm_sub_ps(iz1
,jz1
);
265 dx12
= _mm_sub_ps(ix1
,jx2
);
266 dy12
= _mm_sub_ps(iy1
,jy2
);
267 dz12
= _mm_sub_ps(iz1
,jz2
);
268 dx20
= _mm_sub_ps(ix2
,jx0
);
269 dy20
= _mm_sub_ps(iy2
,jy0
);
270 dz20
= _mm_sub_ps(iz2
,jz0
);
271 dx21
= _mm_sub_ps(ix2
,jx1
);
272 dy21
= _mm_sub_ps(iy2
,jy1
);
273 dz21
= _mm_sub_ps(iz2
,jz1
);
274 dx22
= _mm_sub_ps(ix2
,jx2
);
275 dy22
= _mm_sub_ps(iy2
,jy2
);
276 dz22
= _mm_sub_ps(iz2
,jz2
);
278 /* Calculate squared distance and things based on it */
279 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
280 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
281 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
282 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
283 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
284 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
285 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
286 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
287 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
289 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
290 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
291 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
292 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
293 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
294 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
295 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
296 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
297 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
299 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
300 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
301 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
302 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
303 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
304 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
305 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
306 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
307 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
309 fjx0
= _mm_setzero_ps();
310 fjy0
= _mm_setzero_ps();
311 fjz0
= _mm_setzero_ps();
312 fjx1
= _mm_setzero_ps();
313 fjy1
= _mm_setzero_ps();
314 fjz1
= _mm_setzero_ps();
315 fjx2
= _mm_setzero_ps();
316 fjy2
= _mm_setzero_ps();
317 fjz2
= _mm_setzero_ps();
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 r00
= _mm_mul_ps(rsq00
,rinv00
);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
329 ewitab
= _mm_cvttps_epi32(ewrt
);
330 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
331 ewitab
= _mm_slli_epi32(ewitab
,2);
332 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
333 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
334 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
335 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
336 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
337 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
338 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
339 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
340 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
342 /* Analytical LJ-PME */
343 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
344 ewcljrsq
= _mm_mul_ps(ewclj2
,rsq00
);
345 ewclj6
= _mm_mul_ps(ewclj2
,_mm_mul_ps(ewclj2
,ewclj2
));
346 exponent
= gmx_simd_exp_r(ewcljrsq
);
347 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
348 poly
= _mm_mul_ps(exponent
,_mm_add_ps(_mm_sub_ps(one
,ewcljrsq
),_mm_mul_ps(_mm_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
349 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
350 vvdw6
= _mm_mul_ps(_mm_sub_ps(c6_00
,_mm_mul_ps(c6grid_00
,_mm_sub_ps(one
,poly
))),rinvsix
);
351 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
352 vvdw
= _mm_sub_ps(_mm_mul_ps(vvdw12
,one_twelfth
),_mm_mul_ps(vvdw6
,one_sixth
));
353 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,_mm_sub_ps(vvdw6
,_mm_mul_ps(_mm_mul_ps(c6grid_00
,one_sixth
),_mm_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velecsum
= _mm_add_ps(velecsum
,velec
);
358 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
360 fscal
= _mm_add_ps(felec
,fvdw
);
362 /* Calculate temporary vectorial force */
363 tx
= _mm_mul_ps(fscal
,dx00
);
364 ty
= _mm_mul_ps(fscal
,dy00
);
365 tz
= _mm_mul_ps(fscal
,dz00
);
367 /* Update vectorial force */
368 fix0
= _mm_add_ps(fix0
,tx
);
369 fiy0
= _mm_add_ps(fiy0
,ty
);
370 fiz0
= _mm_add_ps(fiz0
,tz
);
372 fjx0
= _mm_add_ps(fjx0
,tx
);
373 fjy0
= _mm_add_ps(fjy0
,ty
);
374 fjz0
= _mm_add_ps(fjz0
,tz
);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r01
= _mm_mul_ps(rsq01
,rinv01
);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
386 ewitab
= _mm_cvttps_epi32(ewrt
);
387 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
388 ewitab
= _mm_slli_epi32(ewitab
,2);
389 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
390 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
391 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
392 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
393 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
394 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
395 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
396 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
397 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum
= _mm_add_ps(velecsum
,velec
);
404 /* Calculate temporary vectorial force */
405 tx
= _mm_mul_ps(fscal
,dx01
);
406 ty
= _mm_mul_ps(fscal
,dy01
);
407 tz
= _mm_mul_ps(fscal
,dz01
);
409 /* Update vectorial force */
410 fix0
= _mm_add_ps(fix0
,tx
);
411 fiy0
= _mm_add_ps(fiy0
,ty
);
412 fiz0
= _mm_add_ps(fiz0
,tz
);
414 fjx1
= _mm_add_ps(fjx1
,tx
);
415 fjy1
= _mm_add_ps(fjy1
,ty
);
416 fjz1
= _mm_add_ps(fjz1
,tz
);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r02
= _mm_mul_ps(rsq02
,rinv02
);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
428 ewitab
= _mm_cvttps_epi32(ewrt
);
429 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
430 ewitab
= _mm_slli_epi32(ewitab
,2);
431 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
432 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
433 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
434 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
435 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
436 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
437 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
438 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
439 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum
= _mm_add_ps(velecsum
,velec
);
446 /* Calculate temporary vectorial force */
447 tx
= _mm_mul_ps(fscal
,dx02
);
448 ty
= _mm_mul_ps(fscal
,dy02
);
449 tz
= _mm_mul_ps(fscal
,dz02
);
451 /* Update vectorial force */
452 fix0
= _mm_add_ps(fix0
,tx
);
453 fiy0
= _mm_add_ps(fiy0
,ty
);
454 fiz0
= _mm_add_ps(fiz0
,tz
);
456 fjx2
= _mm_add_ps(fjx2
,tx
);
457 fjy2
= _mm_add_ps(fjy2
,ty
);
458 fjz2
= _mm_add_ps(fjz2
,tz
);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r10
= _mm_mul_ps(rsq10
,rinv10
);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
470 ewitab
= _mm_cvttps_epi32(ewrt
);
471 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
472 ewitab
= _mm_slli_epi32(ewitab
,2);
473 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
474 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
475 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
476 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
477 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
478 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
479 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
480 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
481 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum
= _mm_add_ps(velecsum
,velec
);
488 /* Calculate temporary vectorial force */
489 tx
= _mm_mul_ps(fscal
,dx10
);
490 ty
= _mm_mul_ps(fscal
,dy10
);
491 tz
= _mm_mul_ps(fscal
,dz10
);
493 /* Update vectorial force */
494 fix1
= _mm_add_ps(fix1
,tx
);
495 fiy1
= _mm_add_ps(fiy1
,ty
);
496 fiz1
= _mm_add_ps(fiz1
,tz
);
498 fjx0
= _mm_add_ps(fjx0
,tx
);
499 fjy0
= _mm_add_ps(fjy0
,ty
);
500 fjz0
= _mm_add_ps(fjz0
,tz
);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r11
= _mm_mul_ps(rsq11
,rinv11
);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
512 ewitab
= _mm_cvttps_epi32(ewrt
);
513 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
514 ewitab
= _mm_slli_epi32(ewitab
,2);
515 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
516 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
517 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
518 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
519 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
520 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
521 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
522 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
523 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum
= _mm_add_ps(velecsum
,velec
);
530 /* Calculate temporary vectorial force */
531 tx
= _mm_mul_ps(fscal
,dx11
);
532 ty
= _mm_mul_ps(fscal
,dy11
);
533 tz
= _mm_mul_ps(fscal
,dz11
);
535 /* Update vectorial force */
536 fix1
= _mm_add_ps(fix1
,tx
);
537 fiy1
= _mm_add_ps(fiy1
,ty
);
538 fiz1
= _mm_add_ps(fiz1
,tz
);
540 fjx1
= _mm_add_ps(fjx1
,tx
);
541 fjy1
= _mm_add_ps(fjy1
,ty
);
542 fjz1
= _mm_add_ps(fjz1
,tz
);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r12
= _mm_mul_ps(rsq12
,rinv12
);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
554 ewitab
= _mm_cvttps_epi32(ewrt
);
555 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
556 ewitab
= _mm_slli_epi32(ewitab
,2);
557 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
558 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
559 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
560 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
561 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
562 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
563 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
564 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
565 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum
= _mm_add_ps(velecsum
,velec
);
572 /* Calculate temporary vectorial force */
573 tx
= _mm_mul_ps(fscal
,dx12
);
574 ty
= _mm_mul_ps(fscal
,dy12
);
575 tz
= _mm_mul_ps(fscal
,dz12
);
577 /* Update vectorial force */
578 fix1
= _mm_add_ps(fix1
,tx
);
579 fiy1
= _mm_add_ps(fiy1
,ty
);
580 fiz1
= _mm_add_ps(fiz1
,tz
);
582 fjx2
= _mm_add_ps(fjx2
,tx
);
583 fjy2
= _mm_add_ps(fjy2
,ty
);
584 fjz2
= _mm_add_ps(fjz2
,tz
);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r20
= _mm_mul_ps(rsq20
,rinv20
);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
596 ewitab
= _mm_cvttps_epi32(ewrt
);
597 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
598 ewitab
= _mm_slli_epi32(ewitab
,2);
599 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
600 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
601 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
602 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
603 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
604 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
605 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
606 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
607 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum
= _mm_add_ps(velecsum
,velec
);
614 /* Calculate temporary vectorial force */
615 tx
= _mm_mul_ps(fscal
,dx20
);
616 ty
= _mm_mul_ps(fscal
,dy20
);
617 tz
= _mm_mul_ps(fscal
,dz20
);
619 /* Update vectorial force */
620 fix2
= _mm_add_ps(fix2
,tx
);
621 fiy2
= _mm_add_ps(fiy2
,ty
);
622 fiz2
= _mm_add_ps(fiz2
,tz
);
624 fjx0
= _mm_add_ps(fjx0
,tx
);
625 fjy0
= _mm_add_ps(fjy0
,ty
);
626 fjz0
= _mm_add_ps(fjz0
,tz
);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r21
= _mm_mul_ps(rsq21
,rinv21
);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
638 ewitab
= _mm_cvttps_epi32(ewrt
);
639 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
640 ewitab
= _mm_slli_epi32(ewitab
,2);
641 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
642 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
643 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
644 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
645 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
646 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
647 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
648 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
649 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum
= _mm_add_ps(velecsum
,velec
);
656 /* Calculate temporary vectorial force */
657 tx
= _mm_mul_ps(fscal
,dx21
);
658 ty
= _mm_mul_ps(fscal
,dy21
);
659 tz
= _mm_mul_ps(fscal
,dz21
);
661 /* Update vectorial force */
662 fix2
= _mm_add_ps(fix2
,tx
);
663 fiy2
= _mm_add_ps(fiy2
,ty
);
664 fiz2
= _mm_add_ps(fiz2
,tz
);
666 fjx1
= _mm_add_ps(fjx1
,tx
);
667 fjy1
= _mm_add_ps(fjy1
,ty
);
668 fjz1
= _mm_add_ps(fjz1
,tz
);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r22
= _mm_mul_ps(rsq22
,rinv22
);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
680 ewitab
= _mm_cvttps_epi32(ewrt
);
681 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
682 ewitab
= _mm_slli_epi32(ewitab
,2);
683 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
684 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
685 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
686 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
687 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
688 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
689 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
690 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
691 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum
= _mm_add_ps(velecsum
,velec
);
698 /* Calculate temporary vectorial force */
699 tx
= _mm_mul_ps(fscal
,dx22
);
700 ty
= _mm_mul_ps(fscal
,dy22
);
701 tz
= _mm_mul_ps(fscal
,dz22
);
703 /* Update vectorial force */
704 fix2
= _mm_add_ps(fix2
,tx
);
705 fiy2
= _mm_add_ps(fiy2
,ty
);
706 fiz2
= _mm_add_ps(fiz2
,tz
);
708 fjx2
= _mm_add_ps(fjx2
,tx
);
709 fjy2
= _mm_add_ps(fjy2
,ty
);
710 fjz2
= _mm_add_ps(fjz2
,tz
);
712 fjptrA
= f
+j_coord_offsetA
;
713 fjptrB
= f
+j_coord_offsetB
;
714 fjptrC
= f
+j_coord_offsetC
;
715 fjptrD
= f
+j_coord_offsetD
;
717 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
718 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
720 /* Inner loop uses 397 flops */
726 /* Get j neighbor index, and coordinate index */
727 jnrlistA
= jjnr
[jidx
];
728 jnrlistB
= jjnr
[jidx
+1];
729 jnrlistC
= jjnr
[jidx
+2];
730 jnrlistD
= jjnr
[jidx
+3];
731 /* Sign of each element will be negative for non-real atoms.
732 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
733 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
735 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
736 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
737 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
738 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
739 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
740 j_coord_offsetA
= DIM
*jnrA
;
741 j_coord_offsetB
= DIM
*jnrB
;
742 j_coord_offsetC
= DIM
*jnrC
;
743 j_coord_offsetD
= DIM
*jnrD
;
745 /* load j atom coordinates */
746 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
747 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
748 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
750 /* Calculate displacement vector */
751 dx00
= _mm_sub_ps(ix0
,jx0
);
752 dy00
= _mm_sub_ps(iy0
,jy0
);
753 dz00
= _mm_sub_ps(iz0
,jz0
);
754 dx01
= _mm_sub_ps(ix0
,jx1
);
755 dy01
= _mm_sub_ps(iy0
,jy1
);
756 dz01
= _mm_sub_ps(iz0
,jz1
);
757 dx02
= _mm_sub_ps(ix0
,jx2
);
758 dy02
= _mm_sub_ps(iy0
,jy2
);
759 dz02
= _mm_sub_ps(iz0
,jz2
);
760 dx10
= _mm_sub_ps(ix1
,jx0
);
761 dy10
= _mm_sub_ps(iy1
,jy0
);
762 dz10
= _mm_sub_ps(iz1
,jz0
);
763 dx11
= _mm_sub_ps(ix1
,jx1
);
764 dy11
= _mm_sub_ps(iy1
,jy1
);
765 dz11
= _mm_sub_ps(iz1
,jz1
);
766 dx12
= _mm_sub_ps(ix1
,jx2
);
767 dy12
= _mm_sub_ps(iy1
,jy2
);
768 dz12
= _mm_sub_ps(iz1
,jz2
);
769 dx20
= _mm_sub_ps(ix2
,jx0
);
770 dy20
= _mm_sub_ps(iy2
,jy0
);
771 dz20
= _mm_sub_ps(iz2
,jz0
);
772 dx21
= _mm_sub_ps(ix2
,jx1
);
773 dy21
= _mm_sub_ps(iy2
,jy1
);
774 dz21
= _mm_sub_ps(iz2
,jz1
);
775 dx22
= _mm_sub_ps(ix2
,jx2
);
776 dy22
= _mm_sub_ps(iy2
,jy2
);
777 dz22
= _mm_sub_ps(iz2
,jz2
);
779 /* Calculate squared distance and things based on it */
780 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
781 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
782 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
783 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
784 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
785 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
786 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
787 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
788 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
790 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
791 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
792 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
793 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
794 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
795 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
796 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
797 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
798 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
800 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
801 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
802 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
803 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
804 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
805 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
806 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
807 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
808 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
810 fjx0
= _mm_setzero_ps();
811 fjy0
= _mm_setzero_ps();
812 fjz0
= _mm_setzero_ps();
813 fjx1
= _mm_setzero_ps();
814 fjy1
= _mm_setzero_ps();
815 fjz1
= _mm_setzero_ps();
816 fjx2
= _mm_setzero_ps();
817 fjy2
= _mm_setzero_ps();
818 fjz2
= _mm_setzero_ps();
820 /**************************
821 * CALCULATE INTERACTIONS *
822 **************************/
824 r00
= _mm_mul_ps(rsq00
,rinv00
);
825 r00
= _mm_andnot_ps(dummy_mask
,r00
);
827 /* EWALD ELECTROSTATICS */
829 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
830 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
831 ewitab
= _mm_cvttps_epi32(ewrt
);
832 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
833 ewitab
= _mm_slli_epi32(ewitab
,2);
834 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
835 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
836 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
837 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
838 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
839 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
840 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
841 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
842 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
844 /* Analytical LJ-PME */
845 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
846 ewcljrsq
= _mm_mul_ps(ewclj2
,rsq00
);
847 ewclj6
= _mm_mul_ps(ewclj2
,_mm_mul_ps(ewclj2
,ewclj2
));
848 exponent
= gmx_simd_exp_r(ewcljrsq
);
849 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
850 poly
= _mm_mul_ps(exponent
,_mm_add_ps(_mm_sub_ps(one
,ewcljrsq
),_mm_mul_ps(_mm_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
851 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
852 vvdw6
= _mm_mul_ps(_mm_sub_ps(c6_00
,_mm_mul_ps(c6grid_00
,_mm_sub_ps(one
,poly
))),rinvsix
);
853 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
854 vvdw
= _mm_sub_ps(_mm_mul_ps(vvdw12
,one_twelfth
),_mm_mul_ps(vvdw6
,one_sixth
));
855 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
856 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,_mm_sub_ps(vvdw6
,_mm_mul_ps(_mm_mul_ps(c6grid_00
,one_sixth
),_mm_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec
= _mm_andnot_ps(dummy_mask
,velec
);
860 velecsum
= _mm_add_ps(velecsum
,velec
);
861 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
862 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
864 fscal
= _mm_add_ps(felec
,fvdw
);
866 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
868 /* Calculate temporary vectorial force */
869 tx
= _mm_mul_ps(fscal
,dx00
);
870 ty
= _mm_mul_ps(fscal
,dy00
);
871 tz
= _mm_mul_ps(fscal
,dz00
);
873 /* Update vectorial force */
874 fix0
= _mm_add_ps(fix0
,tx
);
875 fiy0
= _mm_add_ps(fiy0
,ty
);
876 fiz0
= _mm_add_ps(fiz0
,tz
);
878 fjx0
= _mm_add_ps(fjx0
,tx
);
879 fjy0
= _mm_add_ps(fjy0
,ty
);
880 fjz0
= _mm_add_ps(fjz0
,tz
);
882 /**************************
883 * CALCULATE INTERACTIONS *
884 **************************/
886 r01
= _mm_mul_ps(rsq01
,rinv01
);
887 r01
= _mm_andnot_ps(dummy_mask
,r01
);
889 /* EWALD ELECTROSTATICS */
891 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
892 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
893 ewitab
= _mm_cvttps_epi32(ewrt
);
894 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
895 ewitab
= _mm_slli_epi32(ewitab
,2);
896 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
897 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
898 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
899 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
900 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
901 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
902 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
903 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
904 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
906 /* Update potential sum for this i atom from the interaction with this j atom. */
907 velec
= _mm_andnot_ps(dummy_mask
,velec
);
908 velecsum
= _mm_add_ps(velecsum
,velec
);
912 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
914 /* Calculate temporary vectorial force */
915 tx
= _mm_mul_ps(fscal
,dx01
);
916 ty
= _mm_mul_ps(fscal
,dy01
);
917 tz
= _mm_mul_ps(fscal
,dz01
);
919 /* Update vectorial force */
920 fix0
= _mm_add_ps(fix0
,tx
);
921 fiy0
= _mm_add_ps(fiy0
,ty
);
922 fiz0
= _mm_add_ps(fiz0
,tz
);
924 fjx1
= _mm_add_ps(fjx1
,tx
);
925 fjy1
= _mm_add_ps(fjy1
,ty
);
926 fjz1
= _mm_add_ps(fjz1
,tz
);
928 /**************************
929 * CALCULATE INTERACTIONS *
930 **************************/
932 r02
= _mm_mul_ps(rsq02
,rinv02
);
933 r02
= _mm_andnot_ps(dummy_mask
,r02
);
935 /* EWALD ELECTROSTATICS */
937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
938 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
939 ewitab
= _mm_cvttps_epi32(ewrt
);
940 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
941 ewitab
= _mm_slli_epi32(ewitab
,2);
942 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
943 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
944 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
945 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
946 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
947 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
948 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
949 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
950 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec
= _mm_andnot_ps(dummy_mask
,velec
);
954 velecsum
= _mm_add_ps(velecsum
,velec
);
958 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
960 /* Calculate temporary vectorial force */
961 tx
= _mm_mul_ps(fscal
,dx02
);
962 ty
= _mm_mul_ps(fscal
,dy02
);
963 tz
= _mm_mul_ps(fscal
,dz02
);
965 /* Update vectorial force */
966 fix0
= _mm_add_ps(fix0
,tx
);
967 fiy0
= _mm_add_ps(fiy0
,ty
);
968 fiz0
= _mm_add_ps(fiz0
,tz
);
970 fjx2
= _mm_add_ps(fjx2
,tx
);
971 fjy2
= _mm_add_ps(fjy2
,ty
);
972 fjz2
= _mm_add_ps(fjz2
,tz
);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r10
= _mm_mul_ps(rsq10
,rinv10
);
979 r10
= _mm_andnot_ps(dummy_mask
,r10
);
981 /* EWALD ELECTROSTATICS */
983 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
984 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
985 ewitab
= _mm_cvttps_epi32(ewrt
);
986 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
987 ewitab
= _mm_slli_epi32(ewitab
,2);
988 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
989 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
990 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
991 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
992 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
993 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
994 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
995 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
996 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1000 velecsum
= _mm_add_ps(velecsum
,velec
);
1004 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1006 /* Calculate temporary vectorial force */
1007 tx
= _mm_mul_ps(fscal
,dx10
);
1008 ty
= _mm_mul_ps(fscal
,dy10
);
1009 tz
= _mm_mul_ps(fscal
,dz10
);
1011 /* Update vectorial force */
1012 fix1
= _mm_add_ps(fix1
,tx
);
1013 fiy1
= _mm_add_ps(fiy1
,ty
);
1014 fiz1
= _mm_add_ps(fiz1
,tz
);
1016 fjx0
= _mm_add_ps(fjx0
,tx
);
1017 fjy0
= _mm_add_ps(fjy0
,ty
);
1018 fjz0
= _mm_add_ps(fjz0
,tz
);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 r11
= _mm_mul_ps(rsq11
,rinv11
);
1025 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1027 /* EWALD ELECTROSTATICS */
1029 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1030 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1031 ewitab
= _mm_cvttps_epi32(ewrt
);
1032 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1033 ewitab
= _mm_slli_epi32(ewitab
,2);
1034 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1035 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1036 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1037 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1038 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1039 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1040 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1041 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
1042 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1044 /* Update potential sum for this i atom from the interaction with this j atom. */
1045 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1046 velecsum
= _mm_add_ps(velecsum
,velec
);
1050 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1052 /* Calculate temporary vectorial force */
1053 tx
= _mm_mul_ps(fscal
,dx11
);
1054 ty
= _mm_mul_ps(fscal
,dy11
);
1055 tz
= _mm_mul_ps(fscal
,dz11
);
1057 /* Update vectorial force */
1058 fix1
= _mm_add_ps(fix1
,tx
);
1059 fiy1
= _mm_add_ps(fiy1
,ty
);
1060 fiz1
= _mm_add_ps(fiz1
,tz
);
1062 fjx1
= _mm_add_ps(fjx1
,tx
);
1063 fjy1
= _mm_add_ps(fjy1
,ty
);
1064 fjz1
= _mm_add_ps(fjz1
,tz
);
1066 /**************************
1067 * CALCULATE INTERACTIONS *
1068 **************************/
1070 r12
= _mm_mul_ps(rsq12
,rinv12
);
1071 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1073 /* EWALD ELECTROSTATICS */
1075 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1076 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1077 ewitab
= _mm_cvttps_epi32(ewrt
);
1078 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1079 ewitab
= _mm_slli_epi32(ewitab
,2);
1080 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1081 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1082 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1083 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1084 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1085 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1086 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1087 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
1088 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1090 /* Update potential sum for this i atom from the interaction with this j atom. */
1091 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1092 velecsum
= _mm_add_ps(velecsum
,velec
);
1096 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1098 /* Calculate temporary vectorial force */
1099 tx
= _mm_mul_ps(fscal
,dx12
);
1100 ty
= _mm_mul_ps(fscal
,dy12
);
1101 tz
= _mm_mul_ps(fscal
,dz12
);
1103 /* Update vectorial force */
1104 fix1
= _mm_add_ps(fix1
,tx
);
1105 fiy1
= _mm_add_ps(fiy1
,ty
);
1106 fiz1
= _mm_add_ps(fiz1
,tz
);
1108 fjx2
= _mm_add_ps(fjx2
,tx
);
1109 fjy2
= _mm_add_ps(fjy2
,ty
);
1110 fjz2
= _mm_add_ps(fjz2
,tz
);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 r20
= _mm_mul_ps(rsq20
,rinv20
);
1117 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1119 /* EWALD ELECTROSTATICS */
1121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1123 ewitab
= _mm_cvttps_epi32(ewrt
);
1124 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1125 ewitab
= _mm_slli_epi32(ewitab
,2);
1126 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1127 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1128 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1129 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1130 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1131 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1132 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1133 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
1134 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1138 velecsum
= _mm_add_ps(velecsum
,velec
);
1142 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1144 /* Calculate temporary vectorial force */
1145 tx
= _mm_mul_ps(fscal
,dx20
);
1146 ty
= _mm_mul_ps(fscal
,dy20
);
1147 tz
= _mm_mul_ps(fscal
,dz20
);
1149 /* Update vectorial force */
1150 fix2
= _mm_add_ps(fix2
,tx
);
1151 fiy2
= _mm_add_ps(fiy2
,ty
);
1152 fiz2
= _mm_add_ps(fiz2
,tz
);
1154 fjx0
= _mm_add_ps(fjx0
,tx
);
1155 fjy0
= _mm_add_ps(fjy0
,ty
);
1156 fjz0
= _mm_add_ps(fjz0
,tz
);
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 r21
= _mm_mul_ps(rsq21
,rinv21
);
1163 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1165 /* EWALD ELECTROSTATICS */
1167 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1168 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1169 ewitab
= _mm_cvttps_epi32(ewrt
);
1170 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1171 ewitab
= _mm_slli_epi32(ewitab
,2);
1172 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1173 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1174 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1175 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1176 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1177 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1178 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1179 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
1180 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1184 velecsum
= _mm_add_ps(velecsum
,velec
);
1188 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1190 /* Calculate temporary vectorial force */
1191 tx
= _mm_mul_ps(fscal
,dx21
);
1192 ty
= _mm_mul_ps(fscal
,dy21
);
1193 tz
= _mm_mul_ps(fscal
,dz21
);
1195 /* Update vectorial force */
1196 fix2
= _mm_add_ps(fix2
,tx
);
1197 fiy2
= _mm_add_ps(fiy2
,ty
);
1198 fiz2
= _mm_add_ps(fiz2
,tz
);
1200 fjx1
= _mm_add_ps(fjx1
,tx
);
1201 fjy1
= _mm_add_ps(fjy1
,ty
);
1202 fjz1
= _mm_add_ps(fjz1
,tz
);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 r22
= _mm_mul_ps(rsq22
,rinv22
);
1209 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1211 /* EWALD ELECTROSTATICS */
1213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1214 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
1215 ewitab
= _mm_cvttps_epi32(ewrt
);
1216 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1217 ewitab
= _mm_slli_epi32(ewitab
,2);
1218 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1219 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1220 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1221 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1222 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1223 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1224 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1225 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
1226 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
1228 /* Update potential sum for this i atom from the interaction with this j atom. */
1229 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1230 velecsum
= _mm_add_ps(velecsum
,velec
);
1234 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1236 /* Calculate temporary vectorial force */
1237 tx
= _mm_mul_ps(fscal
,dx22
);
1238 ty
= _mm_mul_ps(fscal
,dy22
);
1239 tz
= _mm_mul_ps(fscal
,dz22
);
1241 /* Update vectorial force */
1242 fix2
= _mm_add_ps(fix2
,tx
);
1243 fiy2
= _mm_add_ps(fiy2
,ty
);
1244 fiz2
= _mm_add_ps(fiz2
,tz
);
1246 fjx2
= _mm_add_ps(fjx2
,tx
);
1247 fjy2
= _mm_add_ps(fjy2
,ty
);
1248 fjz2
= _mm_add_ps(fjz2
,tz
);
1250 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1251 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1252 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1253 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1255 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1256 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1258 /* Inner loop uses 406 flops */
1261 /* End of innermost loop */
1263 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1264 f
+i_coord_offset
,fshift
+i_shift_offset
);
1267 /* Update potential energies */
1268 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1269 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1271 /* Increment number of inner iterations */
1272 inneriter
+= j_index_end
- j_index_start
;
1274 /* Outer loop uses 20 flops */
1277 /* Increment number of outer iterations */
1280 /* Update outer/inner flops */
1282 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*406);
1285 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1286 * Electrostatics interaction: Ewald
1287 * VdW interaction: LJEwald
1288 * Geometry: Water3-Water3
1289 * Calculate force/pot: Force
1292 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1293 (t_nblist
* gmx_restrict nlist
,
1294 rvec
* gmx_restrict xx
,
1295 rvec
* gmx_restrict ff
,
1296 t_forcerec
* gmx_restrict fr
,
1297 t_mdatoms
* gmx_restrict mdatoms
,
1298 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1299 t_nrnb
* gmx_restrict nrnb
)
1301 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1302 * just 0 for non-waters.
1303 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1304 * jnr indices corresponding to data put in the four positions in the SIMD register.
1306 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1307 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1308 int jnrA
,jnrB
,jnrC
,jnrD
;
1309 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1310 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1311 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1312 real rcutoff_scalar
;
1313 real
*shiftvec
,*fshift
,*x
,*f
;
1314 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1315 real scratch
[4*DIM
];
1316 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1318 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1320 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1322 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1323 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1324 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1325 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1326 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1327 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1328 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1329 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1330 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1331 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1332 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1333 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1334 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1335 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1336 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1337 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1338 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1341 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1344 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1345 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1355 __m128 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
1357 __m128 one_half
= _mm_set1_ps(0.5);
1358 __m128 minus_one
= _mm_set1_ps(-1.0);
1360 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1362 __m128 dummy_mask
,cutoff_mask
;
1363 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1364 __m128 one
= _mm_set1_ps(1.0);
1365 __m128 two
= _mm_set1_ps(2.0);
1371 jindex
= nlist
->jindex
;
1373 shiftidx
= nlist
->shift
;
1375 shiftvec
= fr
->shift_vec
[0];
1376 fshift
= fr
->fshift
[0];
1377 facel
= _mm_set1_ps(fr
->epsfac
);
1378 charge
= mdatoms
->chargeA
;
1379 nvdwtype
= fr
->ntype
;
1380 vdwparam
= fr
->nbfp
;
1381 vdwtype
= mdatoms
->typeA
;
1382 vdwgridparam
= fr
->ljpme_c6grid
;
1383 sh_lj_ewald
= _mm_set1_ps(fr
->ic
->sh_lj_ewald
);
1384 ewclj
= _mm_set1_ps(fr
->ewaldcoeff_lj
);
1385 ewclj2
= _mm_mul_ps(minus_one
,_mm_mul_ps(ewclj
,ewclj
));
1387 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1388 ewtab
= fr
->ic
->tabq_coul_F
;
1389 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1390 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1392 /* Setup water-specific parameters */
1393 inr
= nlist
->iinr
[0];
1394 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1395 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1396 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1397 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1399 jq0
= _mm_set1_ps(charge
[inr
+0]);
1400 jq1
= _mm_set1_ps(charge
[inr
+1]);
1401 jq2
= _mm_set1_ps(charge
[inr
+2]);
1402 vdwjidx0A
= 2*vdwtype
[inr
+0];
1403 qq00
= _mm_mul_ps(iq0
,jq0
);
1404 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1405 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1406 c6grid_00
= _mm_set1_ps(vdwgridparam
[vdwioffset0
+vdwjidx0A
]);
1407 qq01
= _mm_mul_ps(iq0
,jq1
);
1408 qq02
= _mm_mul_ps(iq0
,jq2
);
1409 qq10
= _mm_mul_ps(iq1
,jq0
);
1410 qq11
= _mm_mul_ps(iq1
,jq1
);
1411 qq12
= _mm_mul_ps(iq1
,jq2
);
1412 qq20
= _mm_mul_ps(iq2
,jq0
);
1413 qq21
= _mm_mul_ps(iq2
,jq1
);
1414 qq22
= _mm_mul_ps(iq2
,jq2
);
1416 /* Avoid stupid compiler warnings */
1417 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1418 j_coord_offsetA
= 0;
1419 j_coord_offsetB
= 0;
1420 j_coord_offsetC
= 0;
1421 j_coord_offsetD
= 0;
1426 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1428 scratch
[iidx
] = 0.0;
1431 /* Start outer loop over neighborlists */
1432 for(iidx
=0; iidx
<nri
; iidx
++)
1434 /* Load shift vector for this list */
1435 i_shift_offset
= DIM
*shiftidx
[iidx
];
1437 /* Load limits for loop over neighbors */
1438 j_index_start
= jindex
[iidx
];
1439 j_index_end
= jindex
[iidx
+1];
1441 /* Get outer coordinate index */
1443 i_coord_offset
= DIM
*inr
;
1445 /* Load i particle coords and add shift vector */
1446 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1447 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1449 fix0
= _mm_setzero_ps();
1450 fiy0
= _mm_setzero_ps();
1451 fiz0
= _mm_setzero_ps();
1452 fix1
= _mm_setzero_ps();
1453 fiy1
= _mm_setzero_ps();
1454 fiz1
= _mm_setzero_ps();
1455 fix2
= _mm_setzero_ps();
1456 fiy2
= _mm_setzero_ps();
1457 fiz2
= _mm_setzero_ps();
1459 /* Start inner kernel loop */
1460 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1463 /* Get j neighbor index, and coordinate index */
1465 jnrB
= jjnr
[jidx
+1];
1466 jnrC
= jjnr
[jidx
+2];
1467 jnrD
= jjnr
[jidx
+3];
1468 j_coord_offsetA
= DIM
*jnrA
;
1469 j_coord_offsetB
= DIM
*jnrB
;
1470 j_coord_offsetC
= DIM
*jnrC
;
1471 j_coord_offsetD
= DIM
*jnrD
;
1473 /* load j atom coordinates */
1474 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1475 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1476 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1478 /* Calculate displacement vector */
1479 dx00
= _mm_sub_ps(ix0
,jx0
);
1480 dy00
= _mm_sub_ps(iy0
,jy0
);
1481 dz00
= _mm_sub_ps(iz0
,jz0
);
1482 dx01
= _mm_sub_ps(ix0
,jx1
);
1483 dy01
= _mm_sub_ps(iy0
,jy1
);
1484 dz01
= _mm_sub_ps(iz0
,jz1
);
1485 dx02
= _mm_sub_ps(ix0
,jx2
);
1486 dy02
= _mm_sub_ps(iy0
,jy2
);
1487 dz02
= _mm_sub_ps(iz0
,jz2
);
1488 dx10
= _mm_sub_ps(ix1
,jx0
);
1489 dy10
= _mm_sub_ps(iy1
,jy0
);
1490 dz10
= _mm_sub_ps(iz1
,jz0
);
1491 dx11
= _mm_sub_ps(ix1
,jx1
);
1492 dy11
= _mm_sub_ps(iy1
,jy1
);
1493 dz11
= _mm_sub_ps(iz1
,jz1
);
1494 dx12
= _mm_sub_ps(ix1
,jx2
);
1495 dy12
= _mm_sub_ps(iy1
,jy2
);
1496 dz12
= _mm_sub_ps(iz1
,jz2
);
1497 dx20
= _mm_sub_ps(ix2
,jx0
);
1498 dy20
= _mm_sub_ps(iy2
,jy0
);
1499 dz20
= _mm_sub_ps(iz2
,jz0
);
1500 dx21
= _mm_sub_ps(ix2
,jx1
);
1501 dy21
= _mm_sub_ps(iy2
,jy1
);
1502 dz21
= _mm_sub_ps(iz2
,jz1
);
1503 dx22
= _mm_sub_ps(ix2
,jx2
);
1504 dy22
= _mm_sub_ps(iy2
,jy2
);
1505 dz22
= _mm_sub_ps(iz2
,jz2
);
1507 /* Calculate squared distance and things based on it */
1508 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1509 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1510 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1511 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1512 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1513 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1514 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1515 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1516 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1518 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1519 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1520 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1521 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1522 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1523 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1524 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1525 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1526 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1528 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1529 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1530 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1531 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1532 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1533 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1534 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1535 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1536 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1538 fjx0
= _mm_setzero_ps();
1539 fjy0
= _mm_setzero_ps();
1540 fjz0
= _mm_setzero_ps();
1541 fjx1
= _mm_setzero_ps();
1542 fjy1
= _mm_setzero_ps();
1543 fjz1
= _mm_setzero_ps();
1544 fjx2
= _mm_setzero_ps();
1545 fjy2
= _mm_setzero_ps();
1546 fjz2
= _mm_setzero_ps();
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 r00
= _mm_mul_ps(rsq00
,rinv00
);
1554 /* EWALD ELECTROSTATICS */
1556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1557 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1558 ewitab
= _mm_cvttps_epi32(ewrt
);
1559 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1560 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1561 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1563 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1564 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1566 /* Analytical LJ-PME */
1567 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1568 ewcljrsq
= _mm_mul_ps(ewclj2
,rsq00
);
1569 ewclj6
= _mm_mul_ps(ewclj2
,_mm_mul_ps(ewclj2
,ewclj2
));
1570 exponent
= gmx_simd_exp_r(ewcljrsq
);
1571 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1572 poly
= _mm_mul_ps(exponent
,_mm_add_ps(_mm_sub_ps(one
,ewcljrsq
),_mm_mul_ps(_mm_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
1573 /* f6A = 6 * C6grid * (1 - poly) */
1574 f6A
= _mm_mul_ps(c6grid_00
,_mm_sub_ps(one
,poly
));
1575 /* f6B = C6grid * exponent * beta^6 */
1576 f6B
= _mm_mul_ps(_mm_mul_ps(c6grid_00
,one_sixth
),_mm_mul_ps(exponent
,ewclj6
));
1577 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1578 fvdw
= _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),_mm_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1580 fscal
= _mm_add_ps(felec
,fvdw
);
1582 /* Calculate temporary vectorial force */
1583 tx
= _mm_mul_ps(fscal
,dx00
);
1584 ty
= _mm_mul_ps(fscal
,dy00
);
1585 tz
= _mm_mul_ps(fscal
,dz00
);
1587 /* Update vectorial force */
1588 fix0
= _mm_add_ps(fix0
,tx
);
1589 fiy0
= _mm_add_ps(fiy0
,ty
);
1590 fiz0
= _mm_add_ps(fiz0
,tz
);
1592 fjx0
= _mm_add_ps(fjx0
,tx
);
1593 fjy0
= _mm_add_ps(fjy0
,ty
);
1594 fjz0
= _mm_add_ps(fjz0
,tz
);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 r01
= _mm_mul_ps(rsq01
,rinv01
);
1602 /* EWALD ELECTROSTATICS */
1604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1605 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
1606 ewitab
= _mm_cvttps_epi32(ewrt
);
1607 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1608 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1609 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1611 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1612 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
1616 /* Calculate temporary vectorial force */
1617 tx
= _mm_mul_ps(fscal
,dx01
);
1618 ty
= _mm_mul_ps(fscal
,dy01
);
1619 tz
= _mm_mul_ps(fscal
,dz01
);
1621 /* Update vectorial force */
1622 fix0
= _mm_add_ps(fix0
,tx
);
1623 fiy0
= _mm_add_ps(fiy0
,ty
);
1624 fiz0
= _mm_add_ps(fiz0
,tz
);
1626 fjx1
= _mm_add_ps(fjx1
,tx
);
1627 fjy1
= _mm_add_ps(fjy1
,ty
);
1628 fjz1
= _mm_add_ps(fjz1
,tz
);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 r02
= _mm_mul_ps(rsq02
,rinv02
);
1636 /* EWALD ELECTROSTATICS */
1638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1639 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
1640 ewitab
= _mm_cvttps_epi32(ewrt
);
1641 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1642 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1643 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1645 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1646 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
1650 /* Calculate temporary vectorial force */
1651 tx
= _mm_mul_ps(fscal
,dx02
);
1652 ty
= _mm_mul_ps(fscal
,dy02
);
1653 tz
= _mm_mul_ps(fscal
,dz02
);
1655 /* Update vectorial force */
1656 fix0
= _mm_add_ps(fix0
,tx
);
1657 fiy0
= _mm_add_ps(fiy0
,ty
);
1658 fiz0
= _mm_add_ps(fiz0
,tz
);
1660 fjx2
= _mm_add_ps(fjx2
,tx
);
1661 fjy2
= _mm_add_ps(fjy2
,ty
);
1662 fjz2
= _mm_add_ps(fjz2
,tz
);
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 r10
= _mm_mul_ps(rsq10
,rinv10
);
1670 /* EWALD ELECTROSTATICS */
1672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1673 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1674 ewitab
= _mm_cvttps_epi32(ewrt
);
1675 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1676 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1677 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1679 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1680 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1684 /* Calculate temporary vectorial force */
1685 tx
= _mm_mul_ps(fscal
,dx10
);
1686 ty
= _mm_mul_ps(fscal
,dy10
);
1687 tz
= _mm_mul_ps(fscal
,dz10
);
1689 /* Update vectorial force */
1690 fix1
= _mm_add_ps(fix1
,tx
);
1691 fiy1
= _mm_add_ps(fiy1
,ty
);
1692 fiz1
= _mm_add_ps(fiz1
,tz
);
1694 fjx0
= _mm_add_ps(fjx0
,tx
);
1695 fjy0
= _mm_add_ps(fjy0
,ty
);
1696 fjz0
= _mm_add_ps(fjz0
,tz
);
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 r11
= _mm_mul_ps(rsq11
,rinv11
);
1704 /* EWALD ELECTROSTATICS */
1706 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1707 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1708 ewitab
= _mm_cvttps_epi32(ewrt
);
1709 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1710 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1711 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1713 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1714 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1718 /* Calculate temporary vectorial force */
1719 tx
= _mm_mul_ps(fscal
,dx11
);
1720 ty
= _mm_mul_ps(fscal
,dy11
);
1721 tz
= _mm_mul_ps(fscal
,dz11
);
1723 /* Update vectorial force */
1724 fix1
= _mm_add_ps(fix1
,tx
);
1725 fiy1
= _mm_add_ps(fiy1
,ty
);
1726 fiz1
= _mm_add_ps(fiz1
,tz
);
1728 fjx1
= _mm_add_ps(fjx1
,tx
);
1729 fjy1
= _mm_add_ps(fjy1
,ty
);
1730 fjz1
= _mm_add_ps(fjz1
,tz
);
1732 /**************************
1733 * CALCULATE INTERACTIONS *
1734 **************************/
1736 r12
= _mm_mul_ps(rsq12
,rinv12
);
1738 /* EWALD ELECTROSTATICS */
1740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1741 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1742 ewitab
= _mm_cvttps_epi32(ewrt
);
1743 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1744 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1745 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1747 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1748 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1752 /* Calculate temporary vectorial force */
1753 tx
= _mm_mul_ps(fscal
,dx12
);
1754 ty
= _mm_mul_ps(fscal
,dy12
);
1755 tz
= _mm_mul_ps(fscal
,dz12
);
1757 /* Update vectorial force */
1758 fix1
= _mm_add_ps(fix1
,tx
);
1759 fiy1
= _mm_add_ps(fiy1
,ty
);
1760 fiz1
= _mm_add_ps(fiz1
,tz
);
1762 fjx2
= _mm_add_ps(fjx2
,tx
);
1763 fjy2
= _mm_add_ps(fjy2
,ty
);
1764 fjz2
= _mm_add_ps(fjz2
,tz
);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 r20
= _mm_mul_ps(rsq20
,rinv20
);
1772 /* EWALD ELECTROSTATICS */
1774 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1775 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1776 ewitab
= _mm_cvttps_epi32(ewrt
);
1777 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1778 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1779 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1781 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1782 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1786 /* Calculate temporary vectorial force */
1787 tx
= _mm_mul_ps(fscal
,dx20
);
1788 ty
= _mm_mul_ps(fscal
,dy20
);
1789 tz
= _mm_mul_ps(fscal
,dz20
);
1791 /* Update vectorial force */
1792 fix2
= _mm_add_ps(fix2
,tx
);
1793 fiy2
= _mm_add_ps(fiy2
,ty
);
1794 fiz2
= _mm_add_ps(fiz2
,tz
);
1796 fjx0
= _mm_add_ps(fjx0
,tx
);
1797 fjy0
= _mm_add_ps(fjy0
,ty
);
1798 fjz0
= _mm_add_ps(fjz0
,tz
);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 r21
= _mm_mul_ps(rsq21
,rinv21
);
1806 /* EWALD ELECTROSTATICS */
1808 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1809 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1810 ewitab
= _mm_cvttps_epi32(ewrt
);
1811 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1812 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1813 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1815 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1816 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1820 /* Calculate temporary vectorial force */
1821 tx
= _mm_mul_ps(fscal
,dx21
);
1822 ty
= _mm_mul_ps(fscal
,dy21
);
1823 tz
= _mm_mul_ps(fscal
,dz21
);
1825 /* Update vectorial force */
1826 fix2
= _mm_add_ps(fix2
,tx
);
1827 fiy2
= _mm_add_ps(fiy2
,ty
);
1828 fiz2
= _mm_add_ps(fiz2
,tz
);
1830 fjx1
= _mm_add_ps(fjx1
,tx
);
1831 fjy1
= _mm_add_ps(fjy1
,ty
);
1832 fjz1
= _mm_add_ps(fjz1
,tz
);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r22
= _mm_mul_ps(rsq22
,rinv22
);
1840 /* EWALD ELECTROSTATICS */
1842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1843 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
1844 ewitab
= _mm_cvttps_epi32(ewrt
);
1845 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1846 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1847 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1849 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1850 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
1854 /* Calculate temporary vectorial force */
1855 tx
= _mm_mul_ps(fscal
,dx22
);
1856 ty
= _mm_mul_ps(fscal
,dy22
);
1857 tz
= _mm_mul_ps(fscal
,dz22
);
1859 /* Update vectorial force */
1860 fix2
= _mm_add_ps(fix2
,tx
);
1861 fiy2
= _mm_add_ps(fiy2
,ty
);
1862 fiz2
= _mm_add_ps(fiz2
,tz
);
1864 fjx2
= _mm_add_ps(fjx2
,tx
);
1865 fjy2
= _mm_add_ps(fjy2
,ty
);
1866 fjz2
= _mm_add_ps(fjz2
,tz
);
1868 fjptrA
= f
+j_coord_offsetA
;
1869 fjptrB
= f
+j_coord_offsetB
;
1870 fjptrC
= f
+j_coord_offsetC
;
1871 fjptrD
= f
+j_coord_offsetD
;
1873 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1874 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1876 /* Inner loop uses 347 flops */
1879 if(jidx
<j_index_end
)
1882 /* Get j neighbor index, and coordinate index */
1883 jnrlistA
= jjnr
[jidx
];
1884 jnrlistB
= jjnr
[jidx
+1];
1885 jnrlistC
= jjnr
[jidx
+2];
1886 jnrlistD
= jjnr
[jidx
+3];
1887 /* Sign of each element will be negative for non-real atoms.
1888 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1889 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1891 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1892 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1893 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1894 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1895 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1896 j_coord_offsetA
= DIM
*jnrA
;
1897 j_coord_offsetB
= DIM
*jnrB
;
1898 j_coord_offsetC
= DIM
*jnrC
;
1899 j_coord_offsetD
= DIM
*jnrD
;
1901 /* load j atom coordinates */
1902 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1903 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1904 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1906 /* Calculate displacement vector */
1907 dx00
= _mm_sub_ps(ix0
,jx0
);
1908 dy00
= _mm_sub_ps(iy0
,jy0
);
1909 dz00
= _mm_sub_ps(iz0
,jz0
);
1910 dx01
= _mm_sub_ps(ix0
,jx1
);
1911 dy01
= _mm_sub_ps(iy0
,jy1
);
1912 dz01
= _mm_sub_ps(iz0
,jz1
);
1913 dx02
= _mm_sub_ps(ix0
,jx2
);
1914 dy02
= _mm_sub_ps(iy0
,jy2
);
1915 dz02
= _mm_sub_ps(iz0
,jz2
);
1916 dx10
= _mm_sub_ps(ix1
,jx0
);
1917 dy10
= _mm_sub_ps(iy1
,jy0
);
1918 dz10
= _mm_sub_ps(iz1
,jz0
);
1919 dx11
= _mm_sub_ps(ix1
,jx1
);
1920 dy11
= _mm_sub_ps(iy1
,jy1
);
1921 dz11
= _mm_sub_ps(iz1
,jz1
);
1922 dx12
= _mm_sub_ps(ix1
,jx2
);
1923 dy12
= _mm_sub_ps(iy1
,jy2
);
1924 dz12
= _mm_sub_ps(iz1
,jz2
);
1925 dx20
= _mm_sub_ps(ix2
,jx0
);
1926 dy20
= _mm_sub_ps(iy2
,jy0
);
1927 dz20
= _mm_sub_ps(iz2
,jz0
);
1928 dx21
= _mm_sub_ps(ix2
,jx1
);
1929 dy21
= _mm_sub_ps(iy2
,jy1
);
1930 dz21
= _mm_sub_ps(iz2
,jz1
);
1931 dx22
= _mm_sub_ps(ix2
,jx2
);
1932 dy22
= _mm_sub_ps(iy2
,jy2
);
1933 dz22
= _mm_sub_ps(iz2
,jz2
);
1935 /* Calculate squared distance and things based on it */
1936 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1937 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1938 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1939 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1940 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1941 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1942 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1943 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1944 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1946 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1947 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1948 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1949 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1950 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1951 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1952 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1953 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1954 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1956 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1957 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1958 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1959 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1960 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1961 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1962 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1963 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1964 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1966 fjx0
= _mm_setzero_ps();
1967 fjy0
= _mm_setzero_ps();
1968 fjz0
= _mm_setzero_ps();
1969 fjx1
= _mm_setzero_ps();
1970 fjy1
= _mm_setzero_ps();
1971 fjz1
= _mm_setzero_ps();
1972 fjx2
= _mm_setzero_ps();
1973 fjy2
= _mm_setzero_ps();
1974 fjz2
= _mm_setzero_ps();
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 r00
= _mm_mul_ps(rsq00
,rinv00
);
1981 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1987 ewitab
= _mm_cvttps_epi32(ewrt
);
1988 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1989 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1990 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1992 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1993 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1995 /* Analytical LJ-PME */
1996 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1997 ewcljrsq
= _mm_mul_ps(ewclj2
,rsq00
);
1998 ewclj6
= _mm_mul_ps(ewclj2
,_mm_mul_ps(ewclj2
,ewclj2
));
1999 exponent
= gmx_simd_exp_r(ewcljrsq
);
2000 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2001 poly
= _mm_mul_ps(exponent
,_mm_add_ps(_mm_sub_ps(one
,ewcljrsq
),_mm_mul_ps(_mm_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
2002 /* f6A = 6 * C6grid * (1 - poly) */
2003 f6A
= _mm_mul_ps(c6grid_00
,_mm_sub_ps(one
,poly
));
2004 /* f6B = C6grid * exponent * beta^6 */
2005 f6B
= _mm_mul_ps(_mm_mul_ps(c6grid_00
,one_sixth
),_mm_mul_ps(exponent
,ewclj6
));
2006 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2007 fvdw
= _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),_mm_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
2009 fscal
= _mm_add_ps(felec
,fvdw
);
2011 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2013 /* Calculate temporary vectorial force */
2014 tx
= _mm_mul_ps(fscal
,dx00
);
2015 ty
= _mm_mul_ps(fscal
,dy00
);
2016 tz
= _mm_mul_ps(fscal
,dz00
);
2018 /* Update vectorial force */
2019 fix0
= _mm_add_ps(fix0
,tx
);
2020 fiy0
= _mm_add_ps(fiy0
,ty
);
2021 fiz0
= _mm_add_ps(fiz0
,tz
);
2023 fjx0
= _mm_add_ps(fjx0
,tx
);
2024 fjy0
= _mm_add_ps(fjy0
,ty
);
2025 fjz0
= _mm_add_ps(fjz0
,tz
);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 r01
= _mm_mul_ps(rsq01
,rinv01
);
2032 r01
= _mm_andnot_ps(dummy_mask
,r01
);
2034 /* EWALD ELECTROSTATICS */
2036 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2037 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
2038 ewitab
= _mm_cvttps_epi32(ewrt
);
2039 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2040 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2041 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2043 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2044 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
2048 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2050 /* Calculate temporary vectorial force */
2051 tx
= _mm_mul_ps(fscal
,dx01
);
2052 ty
= _mm_mul_ps(fscal
,dy01
);
2053 tz
= _mm_mul_ps(fscal
,dz01
);
2055 /* Update vectorial force */
2056 fix0
= _mm_add_ps(fix0
,tx
);
2057 fiy0
= _mm_add_ps(fiy0
,ty
);
2058 fiz0
= _mm_add_ps(fiz0
,tz
);
2060 fjx1
= _mm_add_ps(fjx1
,tx
);
2061 fjy1
= _mm_add_ps(fjy1
,ty
);
2062 fjz1
= _mm_add_ps(fjz1
,tz
);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 r02
= _mm_mul_ps(rsq02
,rinv02
);
2069 r02
= _mm_andnot_ps(dummy_mask
,r02
);
2071 /* EWALD ELECTROSTATICS */
2073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
2075 ewitab
= _mm_cvttps_epi32(ewrt
);
2076 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2077 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2078 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2080 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2081 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
2085 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2087 /* Calculate temporary vectorial force */
2088 tx
= _mm_mul_ps(fscal
,dx02
);
2089 ty
= _mm_mul_ps(fscal
,dy02
);
2090 tz
= _mm_mul_ps(fscal
,dz02
);
2092 /* Update vectorial force */
2093 fix0
= _mm_add_ps(fix0
,tx
);
2094 fiy0
= _mm_add_ps(fiy0
,ty
);
2095 fiz0
= _mm_add_ps(fiz0
,tz
);
2097 fjx2
= _mm_add_ps(fjx2
,tx
);
2098 fjy2
= _mm_add_ps(fjy2
,ty
);
2099 fjz2
= _mm_add_ps(fjz2
,tz
);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 r10
= _mm_mul_ps(rsq10
,rinv10
);
2106 r10
= _mm_andnot_ps(dummy_mask
,r10
);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
2112 ewitab
= _mm_cvttps_epi32(ewrt
);
2113 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2114 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2115 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2117 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2118 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
2122 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2124 /* Calculate temporary vectorial force */
2125 tx
= _mm_mul_ps(fscal
,dx10
);
2126 ty
= _mm_mul_ps(fscal
,dy10
);
2127 tz
= _mm_mul_ps(fscal
,dz10
);
2129 /* Update vectorial force */
2130 fix1
= _mm_add_ps(fix1
,tx
);
2131 fiy1
= _mm_add_ps(fiy1
,ty
);
2132 fiz1
= _mm_add_ps(fiz1
,tz
);
2134 fjx0
= _mm_add_ps(fjx0
,tx
);
2135 fjy0
= _mm_add_ps(fjy0
,ty
);
2136 fjz0
= _mm_add_ps(fjz0
,tz
);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r11
= _mm_mul_ps(rsq11
,rinv11
);
2143 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
2149 ewitab
= _mm_cvttps_epi32(ewrt
);
2150 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2151 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2152 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2154 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2155 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
2159 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2161 /* Calculate temporary vectorial force */
2162 tx
= _mm_mul_ps(fscal
,dx11
);
2163 ty
= _mm_mul_ps(fscal
,dy11
);
2164 tz
= _mm_mul_ps(fscal
,dz11
);
2166 /* Update vectorial force */
2167 fix1
= _mm_add_ps(fix1
,tx
);
2168 fiy1
= _mm_add_ps(fiy1
,ty
);
2169 fiz1
= _mm_add_ps(fiz1
,tz
);
2171 fjx1
= _mm_add_ps(fjx1
,tx
);
2172 fjy1
= _mm_add_ps(fjy1
,ty
);
2173 fjz1
= _mm_add_ps(fjz1
,tz
);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r12
= _mm_mul_ps(rsq12
,rinv12
);
2180 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2182 /* EWALD ELECTROSTATICS */
2184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
2186 ewitab
= _mm_cvttps_epi32(ewrt
);
2187 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2188 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2189 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2191 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2192 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
2196 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2198 /* Calculate temporary vectorial force */
2199 tx
= _mm_mul_ps(fscal
,dx12
);
2200 ty
= _mm_mul_ps(fscal
,dy12
);
2201 tz
= _mm_mul_ps(fscal
,dz12
);
2203 /* Update vectorial force */
2204 fix1
= _mm_add_ps(fix1
,tx
);
2205 fiy1
= _mm_add_ps(fiy1
,ty
);
2206 fiz1
= _mm_add_ps(fiz1
,tz
);
2208 fjx2
= _mm_add_ps(fjx2
,tx
);
2209 fjy2
= _mm_add_ps(fjy2
,ty
);
2210 fjz2
= _mm_add_ps(fjz2
,tz
);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 r20
= _mm_mul_ps(rsq20
,rinv20
);
2217 r20
= _mm_andnot_ps(dummy_mask
,r20
);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
2223 ewitab
= _mm_cvttps_epi32(ewrt
);
2224 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2225 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2226 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2228 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2229 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
2233 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2235 /* Calculate temporary vectorial force */
2236 tx
= _mm_mul_ps(fscal
,dx20
);
2237 ty
= _mm_mul_ps(fscal
,dy20
);
2238 tz
= _mm_mul_ps(fscal
,dz20
);
2240 /* Update vectorial force */
2241 fix2
= _mm_add_ps(fix2
,tx
);
2242 fiy2
= _mm_add_ps(fiy2
,ty
);
2243 fiz2
= _mm_add_ps(fiz2
,tz
);
2245 fjx0
= _mm_add_ps(fjx0
,tx
);
2246 fjy0
= _mm_add_ps(fjy0
,ty
);
2247 fjz0
= _mm_add_ps(fjz0
,tz
);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 r21
= _mm_mul_ps(rsq21
,rinv21
);
2254 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2256 /* EWALD ELECTROSTATICS */
2258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
2260 ewitab
= _mm_cvttps_epi32(ewrt
);
2261 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2262 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2263 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2265 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2266 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
2270 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2272 /* Calculate temporary vectorial force */
2273 tx
= _mm_mul_ps(fscal
,dx21
);
2274 ty
= _mm_mul_ps(fscal
,dy21
);
2275 tz
= _mm_mul_ps(fscal
,dz21
);
2277 /* Update vectorial force */
2278 fix2
= _mm_add_ps(fix2
,tx
);
2279 fiy2
= _mm_add_ps(fiy2
,ty
);
2280 fiz2
= _mm_add_ps(fiz2
,tz
);
2282 fjx1
= _mm_add_ps(fjx1
,tx
);
2283 fjy1
= _mm_add_ps(fjy1
,ty
);
2284 fjz1
= _mm_add_ps(fjz1
,tz
);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r22
= _mm_mul_ps(rsq22
,rinv22
);
2291 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2297 ewitab
= _mm_cvttps_epi32(ewrt
);
2298 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2299 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2300 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2302 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2303 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2307 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2309 /* Calculate temporary vectorial force */
2310 tx
= _mm_mul_ps(fscal
,dx22
);
2311 ty
= _mm_mul_ps(fscal
,dy22
);
2312 tz
= _mm_mul_ps(fscal
,dz22
);
2314 /* Update vectorial force */
2315 fix2
= _mm_add_ps(fix2
,tx
);
2316 fiy2
= _mm_add_ps(fiy2
,ty
);
2317 fiz2
= _mm_add_ps(fiz2
,tz
);
2319 fjx2
= _mm_add_ps(fjx2
,tx
);
2320 fjy2
= _mm_add_ps(fjy2
,ty
);
2321 fjz2
= _mm_add_ps(fjz2
,tz
);
2323 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2324 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2325 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2326 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2328 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2329 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2331 /* Inner loop uses 356 flops */
2334 /* End of innermost loop */
2336 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2337 f
+i_coord_offset
,fshift
+i_shift_offset
);
2339 /* Increment number of inner iterations */
2340 inneriter
+= j_index_end
- j_index_start
;
2342 /* Outer loop uses 18 flops */
2345 /* Increment number of outer iterations */
2348 /* Update outer/inner flops */
2350 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*356);