2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
84 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
88 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
91 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
92 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
93 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
94 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
95 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
96 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
97 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
98 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
99 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
100 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
103 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
106 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
107 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
108 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
109 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
112 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
115 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
118 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
120 __m128 dummy_mask
,cutoff_mask
;
121 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one
= _mm_set1_ps(1.0);
123 __m128 two
= _mm_set1_ps(2.0);
129 jindex
= nlist
->jindex
;
131 shiftidx
= nlist
->shift
;
133 shiftvec
= fr
->shift_vec
[0];
134 fshift
= fr
->fshift
[0];
135 facel
= _mm_set1_ps(fr
->epsfac
);
136 charge
= mdatoms
->chargeA
;
137 nvdwtype
= fr
->ntype
;
139 vdwtype
= mdatoms
->typeA
;
141 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
142 ewtab
= fr
->ic
->tabq_coul_FDV0
;
143 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
144 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
146 /* Setup water-specific parameters */
147 inr
= nlist
->iinr
[0];
148 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
149 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
150 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
151 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
153 jq1
= _mm_set1_ps(charge
[inr
+1]);
154 jq2
= _mm_set1_ps(charge
[inr
+2]);
155 jq3
= _mm_set1_ps(charge
[inr
+3]);
156 vdwjidx0A
= 2*vdwtype
[inr
+0];
157 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
158 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
159 qq11
= _mm_mul_ps(iq1
,jq1
);
160 qq12
= _mm_mul_ps(iq1
,jq2
);
161 qq13
= _mm_mul_ps(iq1
,jq3
);
162 qq21
= _mm_mul_ps(iq2
,jq1
);
163 qq22
= _mm_mul_ps(iq2
,jq2
);
164 qq23
= _mm_mul_ps(iq2
,jq3
);
165 qq31
= _mm_mul_ps(iq3
,jq1
);
166 qq32
= _mm_mul_ps(iq3
,jq2
);
167 qq33
= _mm_mul_ps(iq3
,jq3
);
169 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170 rcutoff_scalar
= fr
->rcoulomb
;
171 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
172 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
174 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
175 rvdw
= _mm_set1_ps(fr
->rvdw
);
177 /* Avoid stupid compiler warnings */
178 jnrA
= jnrB
= jnrC
= jnrD
= 0;
187 for(iidx
=0;iidx
<4*DIM
;iidx
++)
192 /* Start outer loop over neighborlists */
193 for(iidx
=0; iidx
<nri
; iidx
++)
195 /* Load shift vector for this list */
196 i_shift_offset
= DIM
*shiftidx
[iidx
];
198 /* Load limits for loop over neighbors */
199 j_index_start
= jindex
[iidx
];
200 j_index_end
= jindex
[iidx
+1];
202 /* Get outer coordinate index */
204 i_coord_offset
= DIM
*inr
;
206 /* Load i particle coords and add shift vector */
207 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
208 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
210 fix0
= _mm_setzero_ps();
211 fiy0
= _mm_setzero_ps();
212 fiz0
= _mm_setzero_ps();
213 fix1
= _mm_setzero_ps();
214 fiy1
= _mm_setzero_ps();
215 fiz1
= _mm_setzero_ps();
216 fix2
= _mm_setzero_ps();
217 fiy2
= _mm_setzero_ps();
218 fiz2
= _mm_setzero_ps();
219 fix3
= _mm_setzero_ps();
220 fiy3
= _mm_setzero_ps();
221 fiz3
= _mm_setzero_ps();
223 /* Reset potential sums */
224 velecsum
= _mm_setzero_ps();
225 vvdwsum
= _mm_setzero_ps();
227 /* Start inner kernel loop */
228 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
231 /* Get j neighbor index, and coordinate index */
236 j_coord_offsetA
= DIM
*jnrA
;
237 j_coord_offsetB
= DIM
*jnrB
;
238 j_coord_offsetC
= DIM
*jnrC
;
239 j_coord_offsetD
= DIM
*jnrD
;
241 /* load j atom coordinates */
242 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
243 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
244 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
245 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
247 /* Calculate displacement vector */
248 dx00
= _mm_sub_ps(ix0
,jx0
);
249 dy00
= _mm_sub_ps(iy0
,jy0
);
250 dz00
= _mm_sub_ps(iz0
,jz0
);
251 dx11
= _mm_sub_ps(ix1
,jx1
);
252 dy11
= _mm_sub_ps(iy1
,jy1
);
253 dz11
= _mm_sub_ps(iz1
,jz1
);
254 dx12
= _mm_sub_ps(ix1
,jx2
);
255 dy12
= _mm_sub_ps(iy1
,jy2
);
256 dz12
= _mm_sub_ps(iz1
,jz2
);
257 dx13
= _mm_sub_ps(ix1
,jx3
);
258 dy13
= _mm_sub_ps(iy1
,jy3
);
259 dz13
= _mm_sub_ps(iz1
,jz3
);
260 dx21
= _mm_sub_ps(ix2
,jx1
);
261 dy21
= _mm_sub_ps(iy2
,jy1
);
262 dz21
= _mm_sub_ps(iz2
,jz1
);
263 dx22
= _mm_sub_ps(ix2
,jx2
);
264 dy22
= _mm_sub_ps(iy2
,jy2
);
265 dz22
= _mm_sub_ps(iz2
,jz2
);
266 dx23
= _mm_sub_ps(ix2
,jx3
);
267 dy23
= _mm_sub_ps(iy2
,jy3
);
268 dz23
= _mm_sub_ps(iz2
,jz3
);
269 dx31
= _mm_sub_ps(ix3
,jx1
);
270 dy31
= _mm_sub_ps(iy3
,jy1
);
271 dz31
= _mm_sub_ps(iz3
,jz1
);
272 dx32
= _mm_sub_ps(ix3
,jx2
);
273 dy32
= _mm_sub_ps(iy3
,jy2
);
274 dz32
= _mm_sub_ps(iz3
,jz2
);
275 dx33
= _mm_sub_ps(ix3
,jx3
);
276 dy33
= _mm_sub_ps(iy3
,jy3
);
277 dz33
= _mm_sub_ps(iz3
,jz3
);
279 /* Calculate squared distance and things based on it */
280 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
281 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
282 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
283 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
284 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
285 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
286 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
287 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
288 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
289 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
291 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
292 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
293 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
294 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
295 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
296 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
297 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
298 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
299 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
301 rinvsq00
= gmx_mm_inv_ps(rsq00
);
302 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
303 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
304 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
305 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
306 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
307 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
308 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
309 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
310 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
312 fjx0
= _mm_setzero_ps();
313 fjy0
= _mm_setzero_ps();
314 fjz0
= _mm_setzero_ps();
315 fjx1
= _mm_setzero_ps();
316 fjy1
= _mm_setzero_ps();
317 fjz1
= _mm_setzero_ps();
318 fjx2
= _mm_setzero_ps();
319 fjy2
= _mm_setzero_ps();
320 fjz2
= _mm_setzero_ps();
321 fjx3
= _mm_setzero_ps();
322 fjy3
= _mm_setzero_ps();
323 fjz3
= _mm_setzero_ps();
325 /**************************
326 * CALCULATE INTERACTIONS *
327 **************************/
329 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
332 /* LENNARD-JONES DISPERSION/REPULSION */
334 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
335 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
336 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
337 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
338 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
339 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
341 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
345 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
349 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
351 /* Calculate temporary vectorial force */
352 tx
= _mm_mul_ps(fscal
,dx00
);
353 ty
= _mm_mul_ps(fscal
,dy00
);
354 tz
= _mm_mul_ps(fscal
,dz00
);
356 /* Update vectorial force */
357 fix0
= _mm_add_ps(fix0
,tx
);
358 fiy0
= _mm_add_ps(fiy0
,ty
);
359 fiz0
= _mm_add_ps(fiz0
,tz
);
361 fjx0
= _mm_add_ps(fjx0
,tx
);
362 fjy0
= _mm_add_ps(fjy0
,ty
);
363 fjz0
= _mm_add_ps(fjz0
,tz
);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
374 r11
= _mm_mul_ps(rsq11
,rinv11
);
376 /* EWALD ELECTROSTATICS */
378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
380 ewitab
= _mm_cvttps_epi32(ewrt
);
381 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
382 ewitab
= _mm_slli_epi32(ewitab
,2);
383 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
384 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
385 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
386 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
387 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
388 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
389 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
390 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_sub_ps(rinv11
,sh_ewald
),velec
));
391 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
393 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velec
= _mm_and_ps(velec
,cutoff_mask
);
397 velecsum
= _mm_add_ps(velecsum
,velec
);
401 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
403 /* Calculate temporary vectorial force */
404 tx
= _mm_mul_ps(fscal
,dx11
);
405 ty
= _mm_mul_ps(fscal
,dy11
);
406 tz
= _mm_mul_ps(fscal
,dz11
);
408 /* Update vectorial force */
409 fix1
= _mm_add_ps(fix1
,tx
);
410 fiy1
= _mm_add_ps(fiy1
,ty
);
411 fiz1
= _mm_add_ps(fiz1
,tz
);
413 fjx1
= _mm_add_ps(fjx1
,tx
);
414 fjy1
= _mm_add_ps(fjy1
,ty
);
415 fjz1
= _mm_add_ps(fjz1
,tz
);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
426 r12
= _mm_mul_ps(rsq12
,rinv12
);
428 /* EWALD ELECTROSTATICS */
430 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
431 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
432 ewitab
= _mm_cvttps_epi32(ewrt
);
433 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
434 ewitab
= _mm_slli_epi32(ewitab
,2);
435 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
436 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
437 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
438 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
439 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
440 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
441 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
442 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_sub_ps(rinv12
,sh_ewald
),velec
));
443 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
445 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velec
= _mm_and_ps(velec
,cutoff_mask
);
449 velecsum
= _mm_add_ps(velecsum
,velec
);
453 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
455 /* Calculate temporary vectorial force */
456 tx
= _mm_mul_ps(fscal
,dx12
);
457 ty
= _mm_mul_ps(fscal
,dy12
);
458 tz
= _mm_mul_ps(fscal
,dz12
);
460 /* Update vectorial force */
461 fix1
= _mm_add_ps(fix1
,tx
);
462 fiy1
= _mm_add_ps(fiy1
,ty
);
463 fiz1
= _mm_add_ps(fiz1
,tz
);
465 fjx2
= _mm_add_ps(fjx2
,tx
);
466 fjy2
= _mm_add_ps(fjy2
,ty
);
467 fjz2
= _mm_add_ps(fjz2
,tz
);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
478 r13
= _mm_mul_ps(rsq13
,rinv13
);
480 /* EWALD ELECTROSTATICS */
482 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
483 ewrt
= _mm_mul_ps(r13
,ewtabscale
);
484 ewitab
= _mm_cvttps_epi32(ewrt
);
485 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
486 ewitab
= _mm_slli_epi32(ewitab
,2);
487 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
488 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
489 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
490 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
491 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
492 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
493 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
494 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_sub_ps(rinv13
,sh_ewald
),velec
));
495 felec
= _mm_mul_ps(_mm_mul_ps(qq13
,rinv13
),_mm_sub_ps(rinvsq13
,felec
));
497 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velec
= _mm_and_ps(velec
,cutoff_mask
);
501 velecsum
= _mm_add_ps(velecsum
,velec
);
505 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
507 /* Calculate temporary vectorial force */
508 tx
= _mm_mul_ps(fscal
,dx13
);
509 ty
= _mm_mul_ps(fscal
,dy13
);
510 tz
= _mm_mul_ps(fscal
,dz13
);
512 /* Update vectorial force */
513 fix1
= _mm_add_ps(fix1
,tx
);
514 fiy1
= _mm_add_ps(fiy1
,ty
);
515 fiz1
= _mm_add_ps(fiz1
,tz
);
517 fjx3
= _mm_add_ps(fjx3
,tx
);
518 fjy3
= _mm_add_ps(fjy3
,ty
);
519 fjz3
= _mm_add_ps(fjz3
,tz
);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
527 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
530 r21
= _mm_mul_ps(rsq21
,rinv21
);
532 /* EWALD ELECTROSTATICS */
534 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
536 ewitab
= _mm_cvttps_epi32(ewrt
);
537 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
538 ewitab
= _mm_slli_epi32(ewitab
,2);
539 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
540 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
541 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
542 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
543 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
544 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
545 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
546 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_sub_ps(rinv21
,sh_ewald
),velec
));
547 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
549 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
551 /* Update potential sum for this i atom from the interaction with this j atom. */
552 velec
= _mm_and_ps(velec
,cutoff_mask
);
553 velecsum
= _mm_add_ps(velecsum
,velec
);
557 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
559 /* Calculate temporary vectorial force */
560 tx
= _mm_mul_ps(fscal
,dx21
);
561 ty
= _mm_mul_ps(fscal
,dy21
);
562 tz
= _mm_mul_ps(fscal
,dz21
);
564 /* Update vectorial force */
565 fix2
= _mm_add_ps(fix2
,tx
);
566 fiy2
= _mm_add_ps(fiy2
,ty
);
567 fiz2
= _mm_add_ps(fiz2
,tz
);
569 fjx1
= _mm_add_ps(fjx1
,tx
);
570 fjy1
= _mm_add_ps(fjy1
,ty
);
571 fjz1
= _mm_add_ps(fjz1
,tz
);
575 /**************************
576 * CALCULATE INTERACTIONS *
577 **************************/
579 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
582 r22
= _mm_mul_ps(rsq22
,rinv22
);
584 /* EWALD ELECTROSTATICS */
586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
587 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
588 ewitab
= _mm_cvttps_epi32(ewrt
);
589 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
590 ewitab
= _mm_slli_epi32(ewitab
,2);
591 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
592 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
593 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
594 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
595 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
596 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
597 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
598 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_sub_ps(rinv22
,sh_ewald
),velec
));
599 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
601 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velec
= _mm_and_ps(velec
,cutoff_mask
);
605 velecsum
= _mm_add_ps(velecsum
,velec
);
609 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
611 /* Calculate temporary vectorial force */
612 tx
= _mm_mul_ps(fscal
,dx22
);
613 ty
= _mm_mul_ps(fscal
,dy22
);
614 tz
= _mm_mul_ps(fscal
,dz22
);
616 /* Update vectorial force */
617 fix2
= _mm_add_ps(fix2
,tx
);
618 fiy2
= _mm_add_ps(fiy2
,ty
);
619 fiz2
= _mm_add_ps(fiz2
,tz
);
621 fjx2
= _mm_add_ps(fjx2
,tx
);
622 fjy2
= _mm_add_ps(fjy2
,ty
);
623 fjz2
= _mm_add_ps(fjz2
,tz
);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
634 r23
= _mm_mul_ps(rsq23
,rinv23
);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt
= _mm_mul_ps(r23
,ewtabscale
);
640 ewitab
= _mm_cvttps_epi32(ewrt
);
641 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
642 ewitab
= _mm_slli_epi32(ewitab
,2);
643 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
644 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
645 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
646 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
647 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
648 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
649 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
650 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_sub_ps(rinv23
,sh_ewald
),velec
));
651 felec
= _mm_mul_ps(_mm_mul_ps(qq23
,rinv23
),_mm_sub_ps(rinvsq23
,felec
));
653 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
655 /* Update potential sum for this i atom from the interaction with this j atom. */
656 velec
= _mm_and_ps(velec
,cutoff_mask
);
657 velecsum
= _mm_add_ps(velecsum
,velec
);
661 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
663 /* Calculate temporary vectorial force */
664 tx
= _mm_mul_ps(fscal
,dx23
);
665 ty
= _mm_mul_ps(fscal
,dy23
);
666 tz
= _mm_mul_ps(fscal
,dz23
);
668 /* Update vectorial force */
669 fix2
= _mm_add_ps(fix2
,tx
);
670 fiy2
= _mm_add_ps(fiy2
,ty
);
671 fiz2
= _mm_add_ps(fiz2
,tz
);
673 fjx3
= _mm_add_ps(fjx3
,tx
);
674 fjy3
= _mm_add_ps(fjy3
,ty
);
675 fjz3
= _mm_add_ps(fjz3
,tz
);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
686 r31
= _mm_mul_ps(rsq31
,rinv31
);
688 /* EWALD ELECTROSTATICS */
690 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
691 ewrt
= _mm_mul_ps(r31
,ewtabscale
);
692 ewitab
= _mm_cvttps_epi32(ewrt
);
693 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
694 ewitab
= _mm_slli_epi32(ewitab
,2);
695 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
696 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
697 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
698 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
699 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
700 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
701 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
702 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_sub_ps(rinv31
,sh_ewald
),velec
));
703 felec
= _mm_mul_ps(_mm_mul_ps(qq31
,rinv31
),_mm_sub_ps(rinvsq31
,felec
));
705 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
707 /* Update potential sum for this i atom from the interaction with this j atom. */
708 velec
= _mm_and_ps(velec
,cutoff_mask
);
709 velecsum
= _mm_add_ps(velecsum
,velec
);
713 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
715 /* Calculate temporary vectorial force */
716 tx
= _mm_mul_ps(fscal
,dx31
);
717 ty
= _mm_mul_ps(fscal
,dy31
);
718 tz
= _mm_mul_ps(fscal
,dz31
);
720 /* Update vectorial force */
721 fix3
= _mm_add_ps(fix3
,tx
);
722 fiy3
= _mm_add_ps(fiy3
,ty
);
723 fiz3
= _mm_add_ps(fiz3
,tz
);
725 fjx1
= _mm_add_ps(fjx1
,tx
);
726 fjy1
= _mm_add_ps(fjy1
,ty
);
727 fjz1
= _mm_add_ps(fjz1
,tz
);
731 /**************************
732 * CALCULATE INTERACTIONS *
733 **************************/
735 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
738 r32
= _mm_mul_ps(rsq32
,rinv32
);
740 /* EWALD ELECTROSTATICS */
742 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
743 ewrt
= _mm_mul_ps(r32
,ewtabscale
);
744 ewitab
= _mm_cvttps_epi32(ewrt
);
745 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
746 ewitab
= _mm_slli_epi32(ewitab
,2);
747 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
748 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
749 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
750 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
751 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
752 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
753 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
754 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_sub_ps(rinv32
,sh_ewald
),velec
));
755 felec
= _mm_mul_ps(_mm_mul_ps(qq32
,rinv32
),_mm_sub_ps(rinvsq32
,felec
));
757 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
759 /* Update potential sum for this i atom from the interaction with this j atom. */
760 velec
= _mm_and_ps(velec
,cutoff_mask
);
761 velecsum
= _mm_add_ps(velecsum
,velec
);
765 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
767 /* Calculate temporary vectorial force */
768 tx
= _mm_mul_ps(fscal
,dx32
);
769 ty
= _mm_mul_ps(fscal
,dy32
);
770 tz
= _mm_mul_ps(fscal
,dz32
);
772 /* Update vectorial force */
773 fix3
= _mm_add_ps(fix3
,tx
);
774 fiy3
= _mm_add_ps(fiy3
,ty
);
775 fiz3
= _mm_add_ps(fiz3
,tz
);
777 fjx2
= _mm_add_ps(fjx2
,tx
);
778 fjy2
= _mm_add_ps(fjy2
,ty
);
779 fjz2
= _mm_add_ps(fjz2
,tz
);
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
790 r33
= _mm_mul_ps(rsq33
,rinv33
);
792 /* EWALD ELECTROSTATICS */
794 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
795 ewrt
= _mm_mul_ps(r33
,ewtabscale
);
796 ewitab
= _mm_cvttps_epi32(ewrt
);
797 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
798 ewitab
= _mm_slli_epi32(ewitab
,2);
799 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
800 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
801 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
802 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
803 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
804 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
805 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
806 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_sub_ps(rinv33
,sh_ewald
),velec
));
807 felec
= _mm_mul_ps(_mm_mul_ps(qq33
,rinv33
),_mm_sub_ps(rinvsq33
,felec
));
809 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec
= _mm_and_ps(velec
,cutoff_mask
);
813 velecsum
= _mm_add_ps(velecsum
,velec
);
817 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
819 /* Calculate temporary vectorial force */
820 tx
= _mm_mul_ps(fscal
,dx33
);
821 ty
= _mm_mul_ps(fscal
,dy33
);
822 tz
= _mm_mul_ps(fscal
,dz33
);
824 /* Update vectorial force */
825 fix3
= _mm_add_ps(fix3
,tx
);
826 fiy3
= _mm_add_ps(fiy3
,ty
);
827 fiz3
= _mm_add_ps(fiz3
,tz
);
829 fjx3
= _mm_add_ps(fjx3
,tx
);
830 fjy3
= _mm_add_ps(fjy3
,ty
);
831 fjz3
= _mm_add_ps(fjz3
,tz
);
835 fjptrA
= f
+j_coord_offsetA
;
836 fjptrB
= f
+j_coord_offsetB
;
837 fjptrC
= f
+j_coord_offsetC
;
838 fjptrD
= f
+j_coord_offsetD
;
840 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
841 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
842 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
844 /* Inner loop uses 458 flops */
850 /* Get j neighbor index, and coordinate index */
851 jnrlistA
= jjnr
[jidx
];
852 jnrlistB
= jjnr
[jidx
+1];
853 jnrlistC
= jjnr
[jidx
+2];
854 jnrlistD
= jjnr
[jidx
+3];
855 /* Sign of each element will be negative for non-real atoms.
856 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
857 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
859 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
860 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
861 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
862 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
863 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
864 j_coord_offsetA
= DIM
*jnrA
;
865 j_coord_offsetB
= DIM
*jnrB
;
866 j_coord_offsetC
= DIM
*jnrC
;
867 j_coord_offsetD
= DIM
*jnrD
;
869 /* load j atom coordinates */
870 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
871 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
872 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
873 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
875 /* Calculate displacement vector */
876 dx00
= _mm_sub_ps(ix0
,jx0
);
877 dy00
= _mm_sub_ps(iy0
,jy0
);
878 dz00
= _mm_sub_ps(iz0
,jz0
);
879 dx11
= _mm_sub_ps(ix1
,jx1
);
880 dy11
= _mm_sub_ps(iy1
,jy1
);
881 dz11
= _mm_sub_ps(iz1
,jz1
);
882 dx12
= _mm_sub_ps(ix1
,jx2
);
883 dy12
= _mm_sub_ps(iy1
,jy2
);
884 dz12
= _mm_sub_ps(iz1
,jz2
);
885 dx13
= _mm_sub_ps(ix1
,jx3
);
886 dy13
= _mm_sub_ps(iy1
,jy3
);
887 dz13
= _mm_sub_ps(iz1
,jz3
);
888 dx21
= _mm_sub_ps(ix2
,jx1
);
889 dy21
= _mm_sub_ps(iy2
,jy1
);
890 dz21
= _mm_sub_ps(iz2
,jz1
);
891 dx22
= _mm_sub_ps(ix2
,jx2
);
892 dy22
= _mm_sub_ps(iy2
,jy2
);
893 dz22
= _mm_sub_ps(iz2
,jz2
);
894 dx23
= _mm_sub_ps(ix2
,jx3
);
895 dy23
= _mm_sub_ps(iy2
,jy3
);
896 dz23
= _mm_sub_ps(iz2
,jz3
);
897 dx31
= _mm_sub_ps(ix3
,jx1
);
898 dy31
= _mm_sub_ps(iy3
,jy1
);
899 dz31
= _mm_sub_ps(iz3
,jz1
);
900 dx32
= _mm_sub_ps(ix3
,jx2
);
901 dy32
= _mm_sub_ps(iy3
,jy2
);
902 dz32
= _mm_sub_ps(iz3
,jz2
);
903 dx33
= _mm_sub_ps(ix3
,jx3
);
904 dy33
= _mm_sub_ps(iy3
,jy3
);
905 dz33
= _mm_sub_ps(iz3
,jz3
);
907 /* Calculate squared distance and things based on it */
908 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
909 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
910 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
911 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
912 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
913 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
914 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
915 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
916 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
917 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
919 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
920 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
921 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
922 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
923 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
924 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
925 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
926 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
927 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
929 rinvsq00
= gmx_mm_inv_ps(rsq00
);
930 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
931 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
932 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
933 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
934 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
935 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
936 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
937 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
938 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
940 fjx0
= _mm_setzero_ps();
941 fjy0
= _mm_setzero_ps();
942 fjz0
= _mm_setzero_ps();
943 fjx1
= _mm_setzero_ps();
944 fjy1
= _mm_setzero_ps();
945 fjz1
= _mm_setzero_ps();
946 fjx2
= _mm_setzero_ps();
947 fjy2
= _mm_setzero_ps();
948 fjz2
= _mm_setzero_ps();
949 fjx3
= _mm_setzero_ps();
950 fjy3
= _mm_setzero_ps();
951 fjz3
= _mm_setzero_ps();
953 /**************************
954 * CALCULATE INTERACTIONS *
955 **************************/
957 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
960 /* LENNARD-JONES DISPERSION/REPULSION */
962 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
963 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
964 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
965 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
966 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
967 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
969 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
971 /* Update potential sum for this i atom from the interaction with this j atom. */
972 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
973 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
974 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
978 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
980 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
982 /* Calculate temporary vectorial force */
983 tx
= _mm_mul_ps(fscal
,dx00
);
984 ty
= _mm_mul_ps(fscal
,dy00
);
985 tz
= _mm_mul_ps(fscal
,dz00
);
987 /* Update vectorial force */
988 fix0
= _mm_add_ps(fix0
,tx
);
989 fiy0
= _mm_add_ps(fiy0
,ty
);
990 fiz0
= _mm_add_ps(fiz0
,tz
);
992 fjx0
= _mm_add_ps(fjx0
,tx
);
993 fjy0
= _mm_add_ps(fjy0
,ty
);
994 fjz0
= _mm_add_ps(fjz0
,tz
);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1005 r11
= _mm_mul_ps(rsq11
,rinv11
);
1006 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1008 /* EWALD ELECTROSTATICS */
1010 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1011 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1012 ewitab
= _mm_cvttps_epi32(ewrt
);
1013 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1014 ewitab
= _mm_slli_epi32(ewitab
,2);
1015 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1016 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1017 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1018 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1019 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1020 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1021 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1022 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_sub_ps(rinv11
,sh_ewald
),velec
));
1023 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1025 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1027 /* Update potential sum for this i atom from the interaction with this j atom. */
1028 velec
= _mm_and_ps(velec
,cutoff_mask
);
1029 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1030 velecsum
= _mm_add_ps(velecsum
,velec
);
1034 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1036 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1038 /* Calculate temporary vectorial force */
1039 tx
= _mm_mul_ps(fscal
,dx11
);
1040 ty
= _mm_mul_ps(fscal
,dy11
);
1041 tz
= _mm_mul_ps(fscal
,dz11
);
1043 /* Update vectorial force */
1044 fix1
= _mm_add_ps(fix1
,tx
);
1045 fiy1
= _mm_add_ps(fiy1
,ty
);
1046 fiz1
= _mm_add_ps(fiz1
,tz
);
1048 fjx1
= _mm_add_ps(fjx1
,tx
);
1049 fjy1
= _mm_add_ps(fjy1
,ty
);
1050 fjz1
= _mm_add_ps(fjz1
,tz
);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1061 r12
= _mm_mul_ps(rsq12
,rinv12
);
1062 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1064 /* EWALD ELECTROSTATICS */
1066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1067 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1068 ewitab
= _mm_cvttps_epi32(ewrt
);
1069 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1070 ewitab
= _mm_slli_epi32(ewitab
,2);
1071 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1072 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1073 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1074 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1075 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1076 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1077 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1078 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_sub_ps(rinv12
,sh_ewald
),velec
));
1079 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1081 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec
= _mm_and_ps(velec
,cutoff_mask
);
1085 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1086 velecsum
= _mm_add_ps(velecsum
,velec
);
1090 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1092 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1094 /* Calculate temporary vectorial force */
1095 tx
= _mm_mul_ps(fscal
,dx12
);
1096 ty
= _mm_mul_ps(fscal
,dy12
);
1097 tz
= _mm_mul_ps(fscal
,dz12
);
1099 /* Update vectorial force */
1100 fix1
= _mm_add_ps(fix1
,tx
);
1101 fiy1
= _mm_add_ps(fiy1
,ty
);
1102 fiz1
= _mm_add_ps(fiz1
,tz
);
1104 fjx2
= _mm_add_ps(fjx2
,tx
);
1105 fjy2
= _mm_add_ps(fjy2
,ty
);
1106 fjz2
= _mm_add_ps(fjz2
,tz
);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1117 r13
= _mm_mul_ps(rsq13
,rinv13
);
1118 r13
= _mm_andnot_ps(dummy_mask
,r13
);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt
= _mm_mul_ps(r13
,ewtabscale
);
1124 ewitab
= _mm_cvttps_epi32(ewrt
);
1125 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1126 ewitab
= _mm_slli_epi32(ewitab
,2);
1127 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1128 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1129 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1130 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1131 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1132 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1133 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1134 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_sub_ps(rinv13
,sh_ewald
),velec
));
1135 felec
= _mm_mul_ps(_mm_mul_ps(qq13
,rinv13
),_mm_sub_ps(rinvsq13
,felec
));
1137 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
1139 /* Update potential sum for this i atom from the interaction with this j atom. */
1140 velec
= _mm_and_ps(velec
,cutoff_mask
);
1141 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1142 velecsum
= _mm_add_ps(velecsum
,velec
);
1146 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1148 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1150 /* Calculate temporary vectorial force */
1151 tx
= _mm_mul_ps(fscal
,dx13
);
1152 ty
= _mm_mul_ps(fscal
,dy13
);
1153 tz
= _mm_mul_ps(fscal
,dz13
);
1155 /* Update vectorial force */
1156 fix1
= _mm_add_ps(fix1
,tx
);
1157 fiy1
= _mm_add_ps(fiy1
,ty
);
1158 fiz1
= _mm_add_ps(fiz1
,tz
);
1160 fjx3
= _mm_add_ps(fjx3
,tx
);
1161 fjy3
= _mm_add_ps(fjy3
,ty
);
1162 fjz3
= _mm_add_ps(fjz3
,tz
);
1166 /**************************
1167 * CALCULATE INTERACTIONS *
1168 **************************/
1170 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1173 r21
= _mm_mul_ps(rsq21
,rinv21
);
1174 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1176 /* EWALD ELECTROSTATICS */
1178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1179 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1180 ewitab
= _mm_cvttps_epi32(ewrt
);
1181 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1182 ewitab
= _mm_slli_epi32(ewitab
,2);
1183 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1184 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1185 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1186 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1187 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1188 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1189 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1190 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_sub_ps(rinv21
,sh_ewald
),velec
));
1191 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1193 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec
= _mm_and_ps(velec
,cutoff_mask
);
1197 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1198 velecsum
= _mm_add_ps(velecsum
,velec
);
1202 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1204 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1206 /* Calculate temporary vectorial force */
1207 tx
= _mm_mul_ps(fscal
,dx21
);
1208 ty
= _mm_mul_ps(fscal
,dy21
);
1209 tz
= _mm_mul_ps(fscal
,dz21
);
1211 /* Update vectorial force */
1212 fix2
= _mm_add_ps(fix2
,tx
);
1213 fiy2
= _mm_add_ps(fiy2
,ty
);
1214 fiz2
= _mm_add_ps(fiz2
,tz
);
1216 fjx1
= _mm_add_ps(fjx1
,tx
);
1217 fjy1
= _mm_add_ps(fjy1
,ty
);
1218 fjz1
= _mm_add_ps(fjz1
,tz
);
1222 /**************************
1223 * CALCULATE INTERACTIONS *
1224 **************************/
1226 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1229 r22
= _mm_mul_ps(rsq22
,rinv22
);
1230 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1232 /* EWALD ELECTROSTATICS */
1234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1235 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
1236 ewitab
= _mm_cvttps_epi32(ewrt
);
1237 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1238 ewitab
= _mm_slli_epi32(ewitab
,2);
1239 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1240 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1241 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1242 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1243 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1244 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1245 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1246 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_sub_ps(rinv22
,sh_ewald
),velec
));
1247 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
1249 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1251 /* Update potential sum for this i atom from the interaction with this j atom. */
1252 velec
= _mm_and_ps(velec
,cutoff_mask
);
1253 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1254 velecsum
= _mm_add_ps(velecsum
,velec
);
1258 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1260 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1262 /* Calculate temporary vectorial force */
1263 tx
= _mm_mul_ps(fscal
,dx22
);
1264 ty
= _mm_mul_ps(fscal
,dy22
);
1265 tz
= _mm_mul_ps(fscal
,dz22
);
1267 /* Update vectorial force */
1268 fix2
= _mm_add_ps(fix2
,tx
);
1269 fiy2
= _mm_add_ps(fiy2
,ty
);
1270 fiz2
= _mm_add_ps(fiz2
,tz
);
1272 fjx2
= _mm_add_ps(fjx2
,tx
);
1273 fjy2
= _mm_add_ps(fjy2
,ty
);
1274 fjz2
= _mm_add_ps(fjz2
,tz
);
1278 /**************************
1279 * CALCULATE INTERACTIONS *
1280 **************************/
1282 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1285 r23
= _mm_mul_ps(rsq23
,rinv23
);
1286 r23
= _mm_andnot_ps(dummy_mask
,r23
);
1288 /* EWALD ELECTROSTATICS */
1290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1291 ewrt
= _mm_mul_ps(r23
,ewtabscale
);
1292 ewitab
= _mm_cvttps_epi32(ewrt
);
1293 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1294 ewitab
= _mm_slli_epi32(ewitab
,2);
1295 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1296 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1297 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1298 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1299 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1300 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1301 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1302 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_sub_ps(rinv23
,sh_ewald
),velec
));
1303 felec
= _mm_mul_ps(_mm_mul_ps(qq23
,rinv23
),_mm_sub_ps(rinvsq23
,felec
));
1305 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
1307 /* Update potential sum for this i atom from the interaction with this j atom. */
1308 velec
= _mm_and_ps(velec
,cutoff_mask
);
1309 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1310 velecsum
= _mm_add_ps(velecsum
,velec
);
1314 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1316 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1318 /* Calculate temporary vectorial force */
1319 tx
= _mm_mul_ps(fscal
,dx23
);
1320 ty
= _mm_mul_ps(fscal
,dy23
);
1321 tz
= _mm_mul_ps(fscal
,dz23
);
1323 /* Update vectorial force */
1324 fix2
= _mm_add_ps(fix2
,tx
);
1325 fiy2
= _mm_add_ps(fiy2
,ty
);
1326 fiz2
= _mm_add_ps(fiz2
,tz
);
1328 fjx3
= _mm_add_ps(fjx3
,tx
);
1329 fjy3
= _mm_add_ps(fjy3
,ty
);
1330 fjz3
= _mm_add_ps(fjz3
,tz
);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1341 r31
= _mm_mul_ps(rsq31
,rinv31
);
1342 r31
= _mm_andnot_ps(dummy_mask
,r31
);
1344 /* EWALD ELECTROSTATICS */
1346 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1347 ewrt
= _mm_mul_ps(r31
,ewtabscale
);
1348 ewitab
= _mm_cvttps_epi32(ewrt
);
1349 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1350 ewitab
= _mm_slli_epi32(ewitab
,2);
1351 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1352 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1353 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1354 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1355 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1356 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1357 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1358 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_sub_ps(rinv31
,sh_ewald
),velec
));
1359 felec
= _mm_mul_ps(_mm_mul_ps(qq31
,rinv31
),_mm_sub_ps(rinvsq31
,felec
));
1361 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
1363 /* Update potential sum for this i atom from the interaction with this j atom. */
1364 velec
= _mm_and_ps(velec
,cutoff_mask
);
1365 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1366 velecsum
= _mm_add_ps(velecsum
,velec
);
1370 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1372 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1374 /* Calculate temporary vectorial force */
1375 tx
= _mm_mul_ps(fscal
,dx31
);
1376 ty
= _mm_mul_ps(fscal
,dy31
);
1377 tz
= _mm_mul_ps(fscal
,dz31
);
1379 /* Update vectorial force */
1380 fix3
= _mm_add_ps(fix3
,tx
);
1381 fiy3
= _mm_add_ps(fiy3
,ty
);
1382 fiz3
= _mm_add_ps(fiz3
,tz
);
1384 fjx1
= _mm_add_ps(fjx1
,tx
);
1385 fjy1
= _mm_add_ps(fjy1
,ty
);
1386 fjz1
= _mm_add_ps(fjz1
,tz
);
1390 /**************************
1391 * CALCULATE INTERACTIONS *
1392 **************************/
1394 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1397 r32
= _mm_mul_ps(rsq32
,rinv32
);
1398 r32
= _mm_andnot_ps(dummy_mask
,r32
);
1400 /* EWALD ELECTROSTATICS */
1402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1403 ewrt
= _mm_mul_ps(r32
,ewtabscale
);
1404 ewitab
= _mm_cvttps_epi32(ewrt
);
1405 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1406 ewitab
= _mm_slli_epi32(ewitab
,2);
1407 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1408 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1409 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1410 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1411 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1412 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1413 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1414 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_sub_ps(rinv32
,sh_ewald
),velec
));
1415 felec
= _mm_mul_ps(_mm_mul_ps(qq32
,rinv32
),_mm_sub_ps(rinvsq32
,felec
));
1417 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
1419 /* Update potential sum for this i atom from the interaction with this j atom. */
1420 velec
= _mm_and_ps(velec
,cutoff_mask
);
1421 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1422 velecsum
= _mm_add_ps(velecsum
,velec
);
1426 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1428 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1430 /* Calculate temporary vectorial force */
1431 tx
= _mm_mul_ps(fscal
,dx32
);
1432 ty
= _mm_mul_ps(fscal
,dy32
);
1433 tz
= _mm_mul_ps(fscal
,dz32
);
1435 /* Update vectorial force */
1436 fix3
= _mm_add_ps(fix3
,tx
);
1437 fiy3
= _mm_add_ps(fiy3
,ty
);
1438 fiz3
= _mm_add_ps(fiz3
,tz
);
1440 fjx2
= _mm_add_ps(fjx2
,tx
);
1441 fjy2
= _mm_add_ps(fjy2
,ty
);
1442 fjz2
= _mm_add_ps(fjz2
,tz
);
1446 /**************************
1447 * CALCULATE INTERACTIONS *
1448 **************************/
1450 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1453 r33
= _mm_mul_ps(rsq33
,rinv33
);
1454 r33
= _mm_andnot_ps(dummy_mask
,r33
);
1456 /* EWALD ELECTROSTATICS */
1458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1459 ewrt
= _mm_mul_ps(r33
,ewtabscale
);
1460 ewitab
= _mm_cvttps_epi32(ewrt
);
1461 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1462 ewitab
= _mm_slli_epi32(ewitab
,2);
1463 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1464 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1465 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1466 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1467 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1468 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1469 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1470 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_sub_ps(rinv33
,sh_ewald
),velec
));
1471 felec
= _mm_mul_ps(_mm_mul_ps(qq33
,rinv33
),_mm_sub_ps(rinvsq33
,felec
));
1473 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
1475 /* Update potential sum for this i atom from the interaction with this j atom. */
1476 velec
= _mm_and_ps(velec
,cutoff_mask
);
1477 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1478 velecsum
= _mm_add_ps(velecsum
,velec
);
1482 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1484 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1486 /* Calculate temporary vectorial force */
1487 tx
= _mm_mul_ps(fscal
,dx33
);
1488 ty
= _mm_mul_ps(fscal
,dy33
);
1489 tz
= _mm_mul_ps(fscal
,dz33
);
1491 /* Update vectorial force */
1492 fix3
= _mm_add_ps(fix3
,tx
);
1493 fiy3
= _mm_add_ps(fiy3
,ty
);
1494 fiz3
= _mm_add_ps(fiz3
,tz
);
1496 fjx3
= _mm_add_ps(fjx3
,tx
);
1497 fjy3
= _mm_add_ps(fjy3
,ty
);
1498 fjz3
= _mm_add_ps(fjz3
,tz
);
1502 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1503 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1504 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1505 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1507 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1508 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1509 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1511 /* Inner loop uses 467 flops */
1514 /* End of innermost loop */
1516 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1517 f
+i_coord_offset
,fshift
+i_shift_offset
);
1520 /* Update potential energies */
1521 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1522 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1524 /* Increment number of inner iterations */
1525 inneriter
+= j_index_end
- j_index_start
;
1527 /* Outer loop uses 26 flops */
1530 /* Increment number of outer iterations */
1533 /* Update outer/inner flops */
1535 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*467);
1538 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1539 * Electrostatics interaction: Ewald
1540 * VdW interaction: LennardJones
1541 * Geometry: Water4-Water4
1542 * Calculate force/pot: Force
1545 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1546 (t_nblist
* gmx_restrict nlist
,
1547 rvec
* gmx_restrict xx
,
1548 rvec
* gmx_restrict ff
,
1549 t_forcerec
* gmx_restrict fr
,
1550 t_mdatoms
* gmx_restrict mdatoms
,
1551 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1552 t_nrnb
* gmx_restrict nrnb
)
1554 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1555 * just 0 for non-waters.
1556 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1557 * jnr indices corresponding to data put in the four positions in the SIMD register.
1559 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1560 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1561 int jnrA
,jnrB
,jnrC
,jnrD
;
1562 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1563 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1564 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1565 real rcutoff_scalar
;
1566 real
*shiftvec
,*fshift
,*x
,*f
;
1567 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1568 real scratch
[4*DIM
];
1569 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1571 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1573 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1575 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1577 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1578 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1579 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1580 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1581 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1582 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1583 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1584 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1585 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1586 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1587 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1588 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1589 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1590 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1591 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1592 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1593 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1594 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1595 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1596 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1599 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1602 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1603 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1605 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1607 __m128 dummy_mask
,cutoff_mask
;
1608 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1609 __m128 one
= _mm_set1_ps(1.0);
1610 __m128 two
= _mm_set1_ps(2.0);
1616 jindex
= nlist
->jindex
;
1618 shiftidx
= nlist
->shift
;
1620 shiftvec
= fr
->shift_vec
[0];
1621 fshift
= fr
->fshift
[0];
1622 facel
= _mm_set1_ps(fr
->epsfac
);
1623 charge
= mdatoms
->chargeA
;
1624 nvdwtype
= fr
->ntype
;
1625 vdwparam
= fr
->nbfp
;
1626 vdwtype
= mdatoms
->typeA
;
1628 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1629 ewtab
= fr
->ic
->tabq_coul_F
;
1630 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1631 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1633 /* Setup water-specific parameters */
1634 inr
= nlist
->iinr
[0];
1635 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1636 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1637 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1638 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1640 jq1
= _mm_set1_ps(charge
[inr
+1]);
1641 jq2
= _mm_set1_ps(charge
[inr
+2]);
1642 jq3
= _mm_set1_ps(charge
[inr
+3]);
1643 vdwjidx0A
= 2*vdwtype
[inr
+0];
1644 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1645 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1646 qq11
= _mm_mul_ps(iq1
,jq1
);
1647 qq12
= _mm_mul_ps(iq1
,jq2
);
1648 qq13
= _mm_mul_ps(iq1
,jq3
);
1649 qq21
= _mm_mul_ps(iq2
,jq1
);
1650 qq22
= _mm_mul_ps(iq2
,jq2
);
1651 qq23
= _mm_mul_ps(iq2
,jq3
);
1652 qq31
= _mm_mul_ps(iq3
,jq1
);
1653 qq32
= _mm_mul_ps(iq3
,jq2
);
1654 qq33
= _mm_mul_ps(iq3
,jq3
);
1656 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1657 rcutoff_scalar
= fr
->rcoulomb
;
1658 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1659 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1661 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
1662 rvdw
= _mm_set1_ps(fr
->rvdw
);
1664 /* Avoid stupid compiler warnings */
1665 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1666 j_coord_offsetA
= 0;
1667 j_coord_offsetB
= 0;
1668 j_coord_offsetC
= 0;
1669 j_coord_offsetD
= 0;
1674 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1676 scratch
[iidx
] = 0.0;
1679 /* Start outer loop over neighborlists */
1680 for(iidx
=0; iidx
<nri
; iidx
++)
1682 /* Load shift vector for this list */
1683 i_shift_offset
= DIM
*shiftidx
[iidx
];
1685 /* Load limits for loop over neighbors */
1686 j_index_start
= jindex
[iidx
];
1687 j_index_end
= jindex
[iidx
+1];
1689 /* Get outer coordinate index */
1691 i_coord_offset
= DIM
*inr
;
1693 /* Load i particle coords and add shift vector */
1694 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1695 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1697 fix0
= _mm_setzero_ps();
1698 fiy0
= _mm_setzero_ps();
1699 fiz0
= _mm_setzero_ps();
1700 fix1
= _mm_setzero_ps();
1701 fiy1
= _mm_setzero_ps();
1702 fiz1
= _mm_setzero_ps();
1703 fix2
= _mm_setzero_ps();
1704 fiy2
= _mm_setzero_ps();
1705 fiz2
= _mm_setzero_ps();
1706 fix3
= _mm_setzero_ps();
1707 fiy3
= _mm_setzero_ps();
1708 fiz3
= _mm_setzero_ps();
1710 /* Start inner kernel loop */
1711 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1714 /* Get j neighbor index, and coordinate index */
1716 jnrB
= jjnr
[jidx
+1];
1717 jnrC
= jjnr
[jidx
+2];
1718 jnrD
= jjnr
[jidx
+3];
1719 j_coord_offsetA
= DIM
*jnrA
;
1720 j_coord_offsetB
= DIM
*jnrB
;
1721 j_coord_offsetC
= DIM
*jnrC
;
1722 j_coord_offsetD
= DIM
*jnrD
;
1724 /* load j atom coordinates */
1725 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1726 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1727 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1728 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1730 /* Calculate displacement vector */
1731 dx00
= _mm_sub_ps(ix0
,jx0
);
1732 dy00
= _mm_sub_ps(iy0
,jy0
);
1733 dz00
= _mm_sub_ps(iz0
,jz0
);
1734 dx11
= _mm_sub_ps(ix1
,jx1
);
1735 dy11
= _mm_sub_ps(iy1
,jy1
);
1736 dz11
= _mm_sub_ps(iz1
,jz1
);
1737 dx12
= _mm_sub_ps(ix1
,jx2
);
1738 dy12
= _mm_sub_ps(iy1
,jy2
);
1739 dz12
= _mm_sub_ps(iz1
,jz2
);
1740 dx13
= _mm_sub_ps(ix1
,jx3
);
1741 dy13
= _mm_sub_ps(iy1
,jy3
);
1742 dz13
= _mm_sub_ps(iz1
,jz3
);
1743 dx21
= _mm_sub_ps(ix2
,jx1
);
1744 dy21
= _mm_sub_ps(iy2
,jy1
);
1745 dz21
= _mm_sub_ps(iz2
,jz1
);
1746 dx22
= _mm_sub_ps(ix2
,jx2
);
1747 dy22
= _mm_sub_ps(iy2
,jy2
);
1748 dz22
= _mm_sub_ps(iz2
,jz2
);
1749 dx23
= _mm_sub_ps(ix2
,jx3
);
1750 dy23
= _mm_sub_ps(iy2
,jy3
);
1751 dz23
= _mm_sub_ps(iz2
,jz3
);
1752 dx31
= _mm_sub_ps(ix3
,jx1
);
1753 dy31
= _mm_sub_ps(iy3
,jy1
);
1754 dz31
= _mm_sub_ps(iz3
,jz1
);
1755 dx32
= _mm_sub_ps(ix3
,jx2
);
1756 dy32
= _mm_sub_ps(iy3
,jy2
);
1757 dz32
= _mm_sub_ps(iz3
,jz2
);
1758 dx33
= _mm_sub_ps(ix3
,jx3
);
1759 dy33
= _mm_sub_ps(iy3
,jy3
);
1760 dz33
= _mm_sub_ps(iz3
,jz3
);
1762 /* Calculate squared distance and things based on it */
1763 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1764 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1765 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1766 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1767 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1768 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1769 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1770 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1771 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1772 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1774 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1775 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1776 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1777 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1778 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1779 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1780 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1781 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1782 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1784 rinvsq00
= gmx_mm_inv_ps(rsq00
);
1785 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1786 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1787 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1788 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1789 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1790 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1791 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1792 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1793 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1795 fjx0
= _mm_setzero_ps();
1796 fjy0
= _mm_setzero_ps();
1797 fjz0
= _mm_setzero_ps();
1798 fjx1
= _mm_setzero_ps();
1799 fjy1
= _mm_setzero_ps();
1800 fjz1
= _mm_setzero_ps();
1801 fjx2
= _mm_setzero_ps();
1802 fjy2
= _mm_setzero_ps();
1803 fjz2
= _mm_setzero_ps();
1804 fjx3
= _mm_setzero_ps();
1805 fjy3
= _mm_setzero_ps();
1806 fjz3
= _mm_setzero_ps();
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1815 /* LENNARD-JONES DISPERSION/REPULSION */
1817 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1818 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1820 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1824 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1826 /* Calculate temporary vectorial force */
1827 tx
= _mm_mul_ps(fscal
,dx00
);
1828 ty
= _mm_mul_ps(fscal
,dy00
);
1829 tz
= _mm_mul_ps(fscal
,dz00
);
1831 /* Update vectorial force */
1832 fix0
= _mm_add_ps(fix0
,tx
);
1833 fiy0
= _mm_add_ps(fiy0
,ty
);
1834 fiz0
= _mm_add_ps(fiz0
,tz
);
1836 fjx0
= _mm_add_ps(fjx0
,tx
);
1837 fjy0
= _mm_add_ps(fjy0
,ty
);
1838 fjz0
= _mm_add_ps(fjz0
,tz
);
1842 /**************************
1843 * CALCULATE INTERACTIONS *
1844 **************************/
1846 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1849 r11
= _mm_mul_ps(rsq11
,rinv11
);
1851 /* EWALD ELECTROSTATICS */
1853 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1854 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1855 ewitab
= _mm_cvttps_epi32(ewrt
);
1856 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1857 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1858 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1860 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1861 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1863 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1867 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1869 /* Calculate temporary vectorial force */
1870 tx
= _mm_mul_ps(fscal
,dx11
);
1871 ty
= _mm_mul_ps(fscal
,dy11
);
1872 tz
= _mm_mul_ps(fscal
,dz11
);
1874 /* Update vectorial force */
1875 fix1
= _mm_add_ps(fix1
,tx
);
1876 fiy1
= _mm_add_ps(fiy1
,ty
);
1877 fiz1
= _mm_add_ps(fiz1
,tz
);
1879 fjx1
= _mm_add_ps(fjx1
,tx
);
1880 fjy1
= _mm_add_ps(fjy1
,ty
);
1881 fjz1
= _mm_add_ps(fjz1
,tz
);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1892 r12
= _mm_mul_ps(rsq12
,rinv12
);
1894 /* EWALD ELECTROSTATICS */
1896 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1897 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1898 ewitab
= _mm_cvttps_epi32(ewrt
);
1899 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1900 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1901 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1903 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1904 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1906 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1910 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1912 /* Calculate temporary vectorial force */
1913 tx
= _mm_mul_ps(fscal
,dx12
);
1914 ty
= _mm_mul_ps(fscal
,dy12
);
1915 tz
= _mm_mul_ps(fscal
,dz12
);
1917 /* Update vectorial force */
1918 fix1
= _mm_add_ps(fix1
,tx
);
1919 fiy1
= _mm_add_ps(fiy1
,ty
);
1920 fiz1
= _mm_add_ps(fiz1
,tz
);
1922 fjx2
= _mm_add_ps(fjx2
,tx
);
1923 fjy2
= _mm_add_ps(fjy2
,ty
);
1924 fjz2
= _mm_add_ps(fjz2
,tz
);
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1935 r13
= _mm_mul_ps(rsq13
,rinv13
);
1937 /* EWALD ELECTROSTATICS */
1939 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1940 ewrt
= _mm_mul_ps(r13
,ewtabscale
);
1941 ewitab
= _mm_cvttps_epi32(ewrt
);
1942 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1943 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1944 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1946 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1947 felec
= _mm_mul_ps(_mm_mul_ps(qq13
,rinv13
),_mm_sub_ps(rinvsq13
,felec
));
1949 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
1953 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1955 /* Calculate temporary vectorial force */
1956 tx
= _mm_mul_ps(fscal
,dx13
);
1957 ty
= _mm_mul_ps(fscal
,dy13
);
1958 tz
= _mm_mul_ps(fscal
,dz13
);
1960 /* Update vectorial force */
1961 fix1
= _mm_add_ps(fix1
,tx
);
1962 fiy1
= _mm_add_ps(fiy1
,ty
);
1963 fiz1
= _mm_add_ps(fiz1
,tz
);
1965 fjx3
= _mm_add_ps(fjx3
,tx
);
1966 fjy3
= _mm_add_ps(fjy3
,ty
);
1967 fjz3
= _mm_add_ps(fjz3
,tz
);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1978 r21
= _mm_mul_ps(rsq21
,rinv21
);
1980 /* EWALD ELECTROSTATICS */
1982 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1984 ewitab
= _mm_cvttps_epi32(ewrt
);
1985 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1986 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1987 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1989 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1990 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1992 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1996 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1998 /* Calculate temporary vectorial force */
1999 tx
= _mm_mul_ps(fscal
,dx21
);
2000 ty
= _mm_mul_ps(fscal
,dy21
);
2001 tz
= _mm_mul_ps(fscal
,dz21
);
2003 /* Update vectorial force */
2004 fix2
= _mm_add_ps(fix2
,tx
);
2005 fiy2
= _mm_add_ps(fiy2
,ty
);
2006 fiz2
= _mm_add_ps(fiz2
,tz
);
2008 fjx1
= _mm_add_ps(fjx1
,tx
);
2009 fjy1
= _mm_add_ps(fjy1
,ty
);
2010 fjz1
= _mm_add_ps(fjz1
,tz
);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2021 r22
= _mm_mul_ps(rsq22
,rinv22
);
2023 /* EWALD ELECTROSTATICS */
2025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2026 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2027 ewitab
= _mm_cvttps_epi32(ewrt
);
2028 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2029 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2030 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2032 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2033 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2035 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2039 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2041 /* Calculate temporary vectorial force */
2042 tx
= _mm_mul_ps(fscal
,dx22
);
2043 ty
= _mm_mul_ps(fscal
,dy22
);
2044 tz
= _mm_mul_ps(fscal
,dz22
);
2046 /* Update vectorial force */
2047 fix2
= _mm_add_ps(fix2
,tx
);
2048 fiy2
= _mm_add_ps(fiy2
,ty
);
2049 fiz2
= _mm_add_ps(fiz2
,tz
);
2051 fjx2
= _mm_add_ps(fjx2
,tx
);
2052 fjy2
= _mm_add_ps(fjy2
,ty
);
2053 fjz2
= _mm_add_ps(fjz2
,tz
);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2064 r23
= _mm_mul_ps(rsq23
,rinv23
);
2066 /* EWALD ELECTROSTATICS */
2068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2069 ewrt
= _mm_mul_ps(r23
,ewtabscale
);
2070 ewitab
= _mm_cvttps_epi32(ewrt
);
2071 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2072 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2073 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2075 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2076 felec
= _mm_mul_ps(_mm_mul_ps(qq23
,rinv23
),_mm_sub_ps(rinvsq23
,felec
));
2078 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
2082 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2084 /* Calculate temporary vectorial force */
2085 tx
= _mm_mul_ps(fscal
,dx23
);
2086 ty
= _mm_mul_ps(fscal
,dy23
);
2087 tz
= _mm_mul_ps(fscal
,dz23
);
2089 /* Update vectorial force */
2090 fix2
= _mm_add_ps(fix2
,tx
);
2091 fiy2
= _mm_add_ps(fiy2
,ty
);
2092 fiz2
= _mm_add_ps(fiz2
,tz
);
2094 fjx3
= _mm_add_ps(fjx3
,tx
);
2095 fjy3
= _mm_add_ps(fjy3
,ty
);
2096 fjz3
= _mm_add_ps(fjz3
,tz
);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2107 r31
= _mm_mul_ps(rsq31
,rinv31
);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt
= _mm_mul_ps(r31
,ewtabscale
);
2113 ewitab
= _mm_cvttps_epi32(ewrt
);
2114 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2115 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2116 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2118 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2119 felec
= _mm_mul_ps(_mm_mul_ps(qq31
,rinv31
),_mm_sub_ps(rinvsq31
,felec
));
2121 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
2125 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2127 /* Calculate temporary vectorial force */
2128 tx
= _mm_mul_ps(fscal
,dx31
);
2129 ty
= _mm_mul_ps(fscal
,dy31
);
2130 tz
= _mm_mul_ps(fscal
,dz31
);
2132 /* Update vectorial force */
2133 fix3
= _mm_add_ps(fix3
,tx
);
2134 fiy3
= _mm_add_ps(fiy3
,ty
);
2135 fiz3
= _mm_add_ps(fiz3
,tz
);
2137 fjx1
= _mm_add_ps(fjx1
,tx
);
2138 fjy1
= _mm_add_ps(fjy1
,ty
);
2139 fjz1
= _mm_add_ps(fjz1
,tz
);
2143 /**************************
2144 * CALCULATE INTERACTIONS *
2145 **************************/
2147 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2150 r32
= _mm_mul_ps(rsq32
,rinv32
);
2152 /* EWALD ELECTROSTATICS */
2154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2155 ewrt
= _mm_mul_ps(r32
,ewtabscale
);
2156 ewitab
= _mm_cvttps_epi32(ewrt
);
2157 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2158 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2159 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2161 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2162 felec
= _mm_mul_ps(_mm_mul_ps(qq32
,rinv32
),_mm_sub_ps(rinvsq32
,felec
));
2164 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
2168 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2170 /* Calculate temporary vectorial force */
2171 tx
= _mm_mul_ps(fscal
,dx32
);
2172 ty
= _mm_mul_ps(fscal
,dy32
);
2173 tz
= _mm_mul_ps(fscal
,dz32
);
2175 /* Update vectorial force */
2176 fix3
= _mm_add_ps(fix3
,tx
);
2177 fiy3
= _mm_add_ps(fiy3
,ty
);
2178 fiz3
= _mm_add_ps(fiz3
,tz
);
2180 fjx2
= _mm_add_ps(fjx2
,tx
);
2181 fjy2
= _mm_add_ps(fjy2
,ty
);
2182 fjz2
= _mm_add_ps(fjz2
,tz
);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2193 r33
= _mm_mul_ps(rsq33
,rinv33
);
2195 /* EWALD ELECTROSTATICS */
2197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198 ewrt
= _mm_mul_ps(r33
,ewtabscale
);
2199 ewitab
= _mm_cvttps_epi32(ewrt
);
2200 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2201 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2202 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2204 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2205 felec
= _mm_mul_ps(_mm_mul_ps(qq33
,rinv33
),_mm_sub_ps(rinvsq33
,felec
));
2207 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
2211 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2213 /* Calculate temporary vectorial force */
2214 tx
= _mm_mul_ps(fscal
,dx33
);
2215 ty
= _mm_mul_ps(fscal
,dy33
);
2216 tz
= _mm_mul_ps(fscal
,dz33
);
2218 /* Update vectorial force */
2219 fix3
= _mm_add_ps(fix3
,tx
);
2220 fiy3
= _mm_add_ps(fiy3
,ty
);
2221 fiz3
= _mm_add_ps(fiz3
,tz
);
2223 fjx3
= _mm_add_ps(fjx3
,tx
);
2224 fjy3
= _mm_add_ps(fjy3
,ty
);
2225 fjz3
= _mm_add_ps(fjz3
,tz
);
2229 fjptrA
= f
+j_coord_offsetA
;
2230 fjptrB
= f
+j_coord_offsetB
;
2231 fjptrC
= f
+j_coord_offsetC
;
2232 fjptrD
= f
+j_coord_offsetD
;
2234 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2235 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2236 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2238 /* Inner loop uses 384 flops */
2241 if(jidx
<j_index_end
)
2244 /* Get j neighbor index, and coordinate index */
2245 jnrlistA
= jjnr
[jidx
];
2246 jnrlistB
= jjnr
[jidx
+1];
2247 jnrlistC
= jjnr
[jidx
+2];
2248 jnrlistD
= jjnr
[jidx
+3];
2249 /* Sign of each element will be negative for non-real atoms.
2250 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2251 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2253 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2254 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2255 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2256 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2257 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2258 j_coord_offsetA
= DIM
*jnrA
;
2259 j_coord_offsetB
= DIM
*jnrB
;
2260 j_coord_offsetC
= DIM
*jnrC
;
2261 j_coord_offsetD
= DIM
*jnrD
;
2263 /* load j atom coordinates */
2264 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2265 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2266 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2267 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2269 /* Calculate displacement vector */
2270 dx00
= _mm_sub_ps(ix0
,jx0
);
2271 dy00
= _mm_sub_ps(iy0
,jy0
);
2272 dz00
= _mm_sub_ps(iz0
,jz0
);
2273 dx11
= _mm_sub_ps(ix1
,jx1
);
2274 dy11
= _mm_sub_ps(iy1
,jy1
);
2275 dz11
= _mm_sub_ps(iz1
,jz1
);
2276 dx12
= _mm_sub_ps(ix1
,jx2
);
2277 dy12
= _mm_sub_ps(iy1
,jy2
);
2278 dz12
= _mm_sub_ps(iz1
,jz2
);
2279 dx13
= _mm_sub_ps(ix1
,jx3
);
2280 dy13
= _mm_sub_ps(iy1
,jy3
);
2281 dz13
= _mm_sub_ps(iz1
,jz3
);
2282 dx21
= _mm_sub_ps(ix2
,jx1
);
2283 dy21
= _mm_sub_ps(iy2
,jy1
);
2284 dz21
= _mm_sub_ps(iz2
,jz1
);
2285 dx22
= _mm_sub_ps(ix2
,jx2
);
2286 dy22
= _mm_sub_ps(iy2
,jy2
);
2287 dz22
= _mm_sub_ps(iz2
,jz2
);
2288 dx23
= _mm_sub_ps(ix2
,jx3
);
2289 dy23
= _mm_sub_ps(iy2
,jy3
);
2290 dz23
= _mm_sub_ps(iz2
,jz3
);
2291 dx31
= _mm_sub_ps(ix3
,jx1
);
2292 dy31
= _mm_sub_ps(iy3
,jy1
);
2293 dz31
= _mm_sub_ps(iz3
,jz1
);
2294 dx32
= _mm_sub_ps(ix3
,jx2
);
2295 dy32
= _mm_sub_ps(iy3
,jy2
);
2296 dz32
= _mm_sub_ps(iz3
,jz2
);
2297 dx33
= _mm_sub_ps(ix3
,jx3
);
2298 dy33
= _mm_sub_ps(iy3
,jy3
);
2299 dz33
= _mm_sub_ps(iz3
,jz3
);
2301 /* Calculate squared distance and things based on it */
2302 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
2303 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
2304 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
2305 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
2306 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2307 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2308 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
2309 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
2310 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
2311 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
2313 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
2314 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
2315 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
2316 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
2317 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
2318 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
2319 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
2320 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
2321 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
2323 rinvsq00
= gmx_mm_inv_ps(rsq00
);
2324 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
2325 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
2326 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
2327 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
2328 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
2329 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
2330 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
2331 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
2332 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
2334 fjx0
= _mm_setzero_ps();
2335 fjy0
= _mm_setzero_ps();
2336 fjz0
= _mm_setzero_ps();
2337 fjx1
= _mm_setzero_ps();
2338 fjy1
= _mm_setzero_ps();
2339 fjz1
= _mm_setzero_ps();
2340 fjx2
= _mm_setzero_ps();
2341 fjy2
= _mm_setzero_ps();
2342 fjz2
= _mm_setzero_ps();
2343 fjx3
= _mm_setzero_ps();
2344 fjy3
= _mm_setzero_ps();
2345 fjz3
= _mm_setzero_ps();
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2354 /* LENNARD-JONES DISPERSION/REPULSION */
2356 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
2357 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
2359 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
2363 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2365 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2367 /* Calculate temporary vectorial force */
2368 tx
= _mm_mul_ps(fscal
,dx00
);
2369 ty
= _mm_mul_ps(fscal
,dy00
);
2370 tz
= _mm_mul_ps(fscal
,dz00
);
2372 /* Update vectorial force */
2373 fix0
= _mm_add_ps(fix0
,tx
);
2374 fiy0
= _mm_add_ps(fiy0
,ty
);
2375 fiz0
= _mm_add_ps(fiz0
,tz
);
2377 fjx0
= _mm_add_ps(fjx0
,tx
);
2378 fjy0
= _mm_add_ps(fjy0
,ty
);
2379 fjz0
= _mm_add_ps(fjz0
,tz
);
2383 /**************************
2384 * CALCULATE INTERACTIONS *
2385 **************************/
2387 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2390 r11
= _mm_mul_ps(rsq11
,rinv11
);
2391 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2393 /* EWALD ELECTROSTATICS */
2395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2396 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
2397 ewitab
= _mm_cvttps_epi32(ewrt
);
2398 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2399 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2400 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2402 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2403 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
2405 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2409 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2411 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2413 /* Calculate temporary vectorial force */
2414 tx
= _mm_mul_ps(fscal
,dx11
);
2415 ty
= _mm_mul_ps(fscal
,dy11
);
2416 tz
= _mm_mul_ps(fscal
,dz11
);
2418 /* Update vectorial force */
2419 fix1
= _mm_add_ps(fix1
,tx
);
2420 fiy1
= _mm_add_ps(fiy1
,ty
);
2421 fiz1
= _mm_add_ps(fiz1
,tz
);
2423 fjx1
= _mm_add_ps(fjx1
,tx
);
2424 fjy1
= _mm_add_ps(fjy1
,ty
);
2425 fjz1
= _mm_add_ps(fjz1
,tz
);
2429 /**************************
2430 * CALCULATE INTERACTIONS *
2431 **************************/
2433 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2436 r12
= _mm_mul_ps(rsq12
,rinv12
);
2437 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2439 /* EWALD ELECTROSTATICS */
2441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2442 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
2443 ewitab
= _mm_cvttps_epi32(ewrt
);
2444 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2445 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2446 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2448 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2449 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
2451 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2455 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2457 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2459 /* Calculate temporary vectorial force */
2460 tx
= _mm_mul_ps(fscal
,dx12
);
2461 ty
= _mm_mul_ps(fscal
,dy12
);
2462 tz
= _mm_mul_ps(fscal
,dz12
);
2464 /* Update vectorial force */
2465 fix1
= _mm_add_ps(fix1
,tx
);
2466 fiy1
= _mm_add_ps(fiy1
,ty
);
2467 fiz1
= _mm_add_ps(fiz1
,tz
);
2469 fjx2
= _mm_add_ps(fjx2
,tx
);
2470 fjy2
= _mm_add_ps(fjy2
,ty
);
2471 fjz2
= _mm_add_ps(fjz2
,tz
);
2475 /**************************
2476 * CALCULATE INTERACTIONS *
2477 **************************/
2479 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2482 r13
= _mm_mul_ps(rsq13
,rinv13
);
2483 r13
= _mm_andnot_ps(dummy_mask
,r13
);
2485 /* EWALD ELECTROSTATICS */
2487 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2488 ewrt
= _mm_mul_ps(r13
,ewtabscale
);
2489 ewitab
= _mm_cvttps_epi32(ewrt
);
2490 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2491 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2492 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2494 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2495 felec
= _mm_mul_ps(_mm_mul_ps(qq13
,rinv13
),_mm_sub_ps(rinvsq13
,felec
));
2497 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
2501 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2503 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2505 /* Calculate temporary vectorial force */
2506 tx
= _mm_mul_ps(fscal
,dx13
);
2507 ty
= _mm_mul_ps(fscal
,dy13
);
2508 tz
= _mm_mul_ps(fscal
,dz13
);
2510 /* Update vectorial force */
2511 fix1
= _mm_add_ps(fix1
,tx
);
2512 fiy1
= _mm_add_ps(fiy1
,ty
);
2513 fiz1
= _mm_add_ps(fiz1
,tz
);
2515 fjx3
= _mm_add_ps(fjx3
,tx
);
2516 fjy3
= _mm_add_ps(fjy3
,ty
);
2517 fjz3
= _mm_add_ps(fjz3
,tz
);
2521 /**************************
2522 * CALCULATE INTERACTIONS *
2523 **************************/
2525 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2528 r21
= _mm_mul_ps(rsq21
,rinv21
);
2529 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2531 /* EWALD ELECTROSTATICS */
2533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2534 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
2535 ewitab
= _mm_cvttps_epi32(ewrt
);
2536 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2537 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2538 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2540 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2541 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
2543 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
2547 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2549 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2551 /* Calculate temporary vectorial force */
2552 tx
= _mm_mul_ps(fscal
,dx21
);
2553 ty
= _mm_mul_ps(fscal
,dy21
);
2554 tz
= _mm_mul_ps(fscal
,dz21
);
2556 /* Update vectorial force */
2557 fix2
= _mm_add_ps(fix2
,tx
);
2558 fiy2
= _mm_add_ps(fiy2
,ty
);
2559 fiz2
= _mm_add_ps(fiz2
,tz
);
2561 fjx1
= _mm_add_ps(fjx1
,tx
);
2562 fjy1
= _mm_add_ps(fjy1
,ty
);
2563 fjz1
= _mm_add_ps(fjz1
,tz
);
2567 /**************************
2568 * CALCULATE INTERACTIONS *
2569 **************************/
2571 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2574 r22
= _mm_mul_ps(rsq22
,rinv22
);
2575 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2577 /* EWALD ELECTROSTATICS */
2579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2580 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2581 ewitab
= _mm_cvttps_epi32(ewrt
);
2582 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2583 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2584 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2586 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2587 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2589 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2593 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2595 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2597 /* Calculate temporary vectorial force */
2598 tx
= _mm_mul_ps(fscal
,dx22
);
2599 ty
= _mm_mul_ps(fscal
,dy22
);
2600 tz
= _mm_mul_ps(fscal
,dz22
);
2602 /* Update vectorial force */
2603 fix2
= _mm_add_ps(fix2
,tx
);
2604 fiy2
= _mm_add_ps(fiy2
,ty
);
2605 fiz2
= _mm_add_ps(fiz2
,tz
);
2607 fjx2
= _mm_add_ps(fjx2
,tx
);
2608 fjy2
= _mm_add_ps(fjy2
,ty
);
2609 fjz2
= _mm_add_ps(fjz2
,tz
);
2613 /**************************
2614 * CALCULATE INTERACTIONS *
2615 **************************/
2617 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2620 r23
= _mm_mul_ps(rsq23
,rinv23
);
2621 r23
= _mm_andnot_ps(dummy_mask
,r23
);
2623 /* EWALD ELECTROSTATICS */
2625 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2626 ewrt
= _mm_mul_ps(r23
,ewtabscale
);
2627 ewitab
= _mm_cvttps_epi32(ewrt
);
2628 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2629 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2630 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2632 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2633 felec
= _mm_mul_ps(_mm_mul_ps(qq23
,rinv23
),_mm_sub_ps(rinvsq23
,felec
));
2635 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
2639 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2641 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2643 /* Calculate temporary vectorial force */
2644 tx
= _mm_mul_ps(fscal
,dx23
);
2645 ty
= _mm_mul_ps(fscal
,dy23
);
2646 tz
= _mm_mul_ps(fscal
,dz23
);
2648 /* Update vectorial force */
2649 fix2
= _mm_add_ps(fix2
,tx
);
2650 fiy2
= _mm_add_ps(fiy2
,ty
);
2651 fiz2
= _mm_add_ps(fiz2
,tz
);
2653 fjx3
= _mm_add_ps(fjx3
,tx
);
2654 fjy3
= _mm_add_ps(fjy3
,ty
);
2655 fjz3
= _mm_add_ps(fjz3
,tz
);
2659 /**************************
2660 * CALCULATE INTERACTIONS *
2661 **************************/
2663 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2666 r31
= _mm_mul_ps(rsq31
,rinv31
);
2667 r31
= _mm_andnot_ps(dummy_mask
,r31
);
2669 /* EWALD ELECTROSTATICS */
2671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2672 ewrt
= _mm_mul_ps(r31
,ewtabscale
);
2673 ewitab
= _mm_cvttps_epi32(ewrt
);
2674 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2675 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2676 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2678 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2679 felec
= _mm_mul_ps(_mm_mul_ps(qq31
,rinv31
),_mm_sub_ps(rinvsq31
,felec
));
2681 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
2685 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2687 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2689 /* Calculate temporary vectorial force */
2690 tx
= _mm_mul_ps(fscal
,dx31
);
2691 ty
= _mm_mul_ps(fscal
,dy31
);
2692 tz
= _mm_mul_ps(fscal
,dz31
);
2694 /* Update vectorial force */
2695 fix3
= _mm_add_ps(fix3
,tx
);
2696 fiy3
= _mm_add_ps(fiy3
,ty
);
2697 fiz3
= _mm_add_ps(fiz3
,tz
);
2699 fjx1
= _mm_add_ps(fjx1
,tx
);
2700 fjy1
= _mm_add_ps(fjy1
,ty
);
2701 fjz1
= _mm_add_ps(fjz1
,tz
);
2705 /**************************
2706 * CALCULATE INTERACTIONS *
2707 **************************/
2709 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2712 r32
= _mm_mul_ps(rsq32
,rinv32
);
2713 r32
= _mm_andnot_ps(dummy_mask
,r32
);
2715 /* EWALD ELECTROSTATICS */
2717 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2718 ewrt
= _mm_mul_ps(r32
,ewtabscale
);
2719 ewitab
= _mm_cvttps_epi32(ewrt
);
2720 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2721 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2722 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2724 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2725 felec
= _mm_mul_ps(_mm_mul_ps(qq32
,rinv32
),_mm_sub_ps(rinvsq32
,felec
));
2727 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
2731 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2733 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2735 /* Calculate temporary vectorial force */
2736 tx
= _mm_mul_ps(fscal
,dx32
);
2737 ty
= _mm_mul_ps(fscal
,dy32
);
2738 tz
= _mm_mul_ps(fscal
,dz32
);
2740 /* Update vectorial force */
2741 fix3
= _mm_add_ps(fix3
,tx
);
2742 fiy3
= _mm_add_ps(fiy3
,ty
);
2743 fiz3
= _mm_add_ps(fiz3
,tz
);
2745 fjx2
= _mm_add_ps(fjx2
,tx
);
2746 fjy2
= _mm_add_ps(fjy2
,ty
);
2747 fjz2
= _mm_add_ps(fjz2
,tz
);
2751 /**************************
2752 * CALCULATE INTERACTIONS *
2753 **************************/
2755 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2758 r33
= _mm_mul_ps(rsq33
,rinv33
);
2759 r33
= _mm_andnot_ps(dummy_mask
,r33
);
2761 /* EWALD ELECTROSTATICS */
2763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2764 ewrt
= _mm_mul_ps(r33
,ewtabscale
);
2765 ewitab
= _mm_cvttps_epi32(ewrt
);
2766 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2767 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2768 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2770 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2771 felec
= _mm_mul_ps(_mm_mul_ps(qq33
,rinv33
),_mm_sub_ps(rinvsq33
,felec
));
2773 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
2777 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2779 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2781 /* Calculate temporary vectorial force */
2782 tx
= _mm_mul_ps(fscal
,dx33
);
2783 ty
= _mm_mul_ps(fscal
,dy33
);
2784 tz
= _mm_mul_ps(fscal
,dz33
);
2786 /* Update vectorial force */
2787 fix3
= _mm_add_ps(fix3
,tx
);
2788 fiy3
= _mm_add_ps(fiy3
,ty
);
2789 fiz3
= _mm_add_ps(fiz3
,tz
);
2791 fjx3
= _mm_add_ps(fjx3
,tx
);
2792 fjy3
= _mm_add_ps(fjy3
,ty
);
2793 fjz3
= _mm_add_ps(fjz3
,tz
);
2797 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2798 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2799 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2800 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2802 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2803 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2804 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2806 /* Inner loop uses 393 flops */
2809 /* End of innermost loop */
2811 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2812 f
+i_coord_offset
,fshift
+i_shift_offset
);
2814 /* Increment number of inner iterations */
2815 inneriter
+= j_index_end
- j_index_start
;
2817 /* Outer loop uses 24 flops */
2820 /* Increment number of outer iterations */
2823 /* Update outer/inner flops */
2825 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*393);