2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: Ewald
54 * VdW interaction: None
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
98 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
99 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
100 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
103 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
110 __m128 dummy_mask
,cutoff_mask
;
111 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112 __m128 one
= _mm_set1_ps(1.0);
113 __m128 two
= _mm_set1_ps(2.0);
119 jindex
= nlist
->jindex
;
121 shiftidx
= nlist
->shift
;
123 shiftvec
= fr
->shift_vec
[0];
124 fshift
= fr
->fshift
[0];
125 facel
= _mm_set1_ps(fr
->epsfac
);
126 charge
= mdatoms
->chargeA
;
128 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
129 ewtab
= fr
->ic
->tabq_coul_FDV0
;
130 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
131 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
133 /* Setup water-specific parameters */
134 inr
= nlist
->iinr
[0];
135 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
136 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
137 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
139 jq0
= _mm_set1_ps(charge
[inr
+0]);
140 jq1
= _mm_set1_ps(charge
[inr
+1]);
141 jq2
= _mm_set1_ps(charge
[inr
+2]);
142 qq00
= _mm_mul_ps(iq0
,jq0
);
143 qq01
= _mm_mul_ps(iq0
,jq1
);
144 qq02
= _mm_mul_ps(iq0
,jq2
);
145 qq10
= _mm_mul_ps(iq1
,jq0
);
146 qq11
= _mm_mul_ps(iq1
,jq1
);
147 qq12
= _mm_mul_ps(iq1
,jq2
);
148 qq20
= _mm_mul_ps(iq2
,jq0
);
149 qq21
= _mm_mul_ps(iq2
,jq1
);
150 qq22
= _mm_mul_ps(iq2
,jq2
);
152 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153 rcutoff_scalar
= fr
->rcoulomb
;
154 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
155 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
157 /* Avoid stupid compiler warnings */
158 jnrA
= jnrB
= jnrC
= jnrD
= 0;
167 for(iidx
=0;iidx
<4*DIM
;iidx
++)
172 /* Start outer loop over neighborlists */
173 for(iidx
=0; iidx
<nri
; iidx
++)
175 /* Load shift vector for this list */
176 i_shift_offset
= DIM
*shiftidx
[iidx
];
178 /* Load limits for loop over neighbors */
179 j_index_start
= jindex
[iidx
];
180 j_index_end
= jindex
[iidx
+1];
182 /* Get outer coordinate index */
184 i_coord_offset
= DIM
*inr
;
186 /* Load i particle coords and add shift vector */
187 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
188 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
190 fix0
= _mm_setzero_ps();
191 fiy0
= _mm_setzero_ps();
192 fiz0
= _mm_setzero_ps();
193 fix1
= _mm_setzero_ps();
194 fiy1
= _mm_setzero_ps();
195 fiz1
= _mm_setzero_ps();
196 fix2
= _mm_setzero_ps();
197 fiy2
= _mm_setzero_ps();
198 fiz2
= _mm_setzero_ps();
200 /* Reset potential sums */
201 velecsum
= _mm_setzero_ps();
203 /* Start inner kernel loop */
204 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
207 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA
= DIM
*jnrA
;
213 j_coord_offsetB
= DIM
*jnrB
;
214 j_coord_offsetC
= DIM
*jnrC
;
215 j_coord_offsetD
= DIM
*jnrD
;
217 /* load j atom coordinates */
218 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
219 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
220 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
222 /* Calculate displacement vector */
223 dx00
= _mm_sub_ps(ix0
,jx0
);
224 dy00
= _mm_sub_ps(iy0
,jy0
);
225 dz00
= _mm_sub_ps(iz0
,jz0
);
226 dx01
= _mm_sub_ps(ix0
,jx1
);
227 dy01
= _mm_sub_ps(iy0
,jy1
);
228 dz01
= _mm_sub_ps(iz0
,jz1
);
229 dx02
= _mm_sub_ps(ix0
,jx2
);
230 dy02
= _mm_sub_ps(iy0
,jy2
);
231 dz02
= _mm_sub_ps(iz0
,jz2
);
232 dx10
= _mm_sub_ps(ix1
,jx0
);
233 dy10
= _mm_sub_ps(iy1
,jy0
);
234 dz10
= _mm_sub_ps(iz1
,jz0
);
235 dx11
= _mm_sub_ps(ix1
,jx1
);
236 dy11
= _mm_sub_ps(iy1
,jy1
);
237 dz11
= _mm_sub_ps(iz1
,jz1
);
238 dx12
= _mm_sub_ps(ix1
,jx2
);
239 dy12
= _mm_sub_ps(iy1
,jy2
);
240 dz12
= _mm_sub_ps(iz1
,jz2
);
241 dx20
= _mm_sub_ps(ix2
,jx0
);
242 dy20
= _mm_sub_ps(iy2
,jy0
);
243 dz20
= _mm_sub_ps(iz2
,jz0
);
244 dx21
= _mm_sub_ps(ix2
,jx1
);
245 dy21
= _mm_sub_ps(iy2
,jy1
);
246 dz21
= _mm_sub_ps(iz2
,jz1
);
247 dx22
= _mm_sub_ps(ix2
,jx2
);
248 dy22
= _mm_sub_ps(iy2
,jy2
);
249 dz22
= _mm_sub_ps(iz2
,jz2
);
251 /* Calculate squared distance and things based on it */
252 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
253 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
254 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
255 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
256 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
257 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
258 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
259 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
260 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
262 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
263 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
264 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
265 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
266 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
267 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
268 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
269 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
270 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
272 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
273 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
274 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
275 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
276 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
277 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
278 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
279 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
280 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
282 fjx0
= _mm_setzero_ps();
283 fjy0
= _mm_setzero_ps();
284 fjz0
= _mm_setzero_ps();
285 fjx1
= _mm_setzero_ps();
286 fjy1
= _mm_setzero_ps();
287 fjz1
= _mm_setzero_ps();
288 fjx2
= _mm_setzero_ps();
289 fjy2
= _mm_setzero_ps();
290 fjz2
= _mm_setzero_ps();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
299 r00
= _mm_mul_ps(rsq00
,rinv00
);
301 /* EWALD ELECTROSTATICS */
303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
304 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
305 ewitab
= _mm_cvttps_epi32(ewrt
);
306 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
307 ewitab
= _mm_slli_epi32(ewitab
,2);
308 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
309 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
310 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
311 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
312 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
313 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
314 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
315 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_sub_ps(rinv00
,sh_ewald
),velec
));
316 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
318 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velec
= _mm_and_ps(velec
,cutoff_mask
);
322 velecsum
= _mm_add_ps(velecsum
,velec
);
326 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
328 /* Calculate temporary vectorial force */
329 tx
= _mm_mul_ps(fscal
,dx00
);
330 ty
= _mm_mul_ps(fscal
,dy00
);
331 tz
= _mm_mul_ps(fscal
,dz00
);
333 /* Update vectorial force */
334 fix0
= _mm_add_ps(fix0
,tx
);
335 fiy0
= _mm_add_ps(fiy0
,ty
);
336 fiz0
= _mm_add_ps(fiz0
,tz
);
338 fjx0
= _mm_add_ps(fjx0
,tx
);
339 fjy0
= _mm_add_ps(fjy0
,ty
);
340 fjz0
= _mm_add_ps(fjz0
,tz
);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
351 r01
= _mm_mul_ps(rsq01
,rinv01
);
353 /* EWALD ELECTROSTATICS */
355 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
356 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
357 ewitab
= _mm_cvttps_epi32(ewrt
);
358 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
359 ewitab
= _mm_slli_epi32(ewitab
,2);
360 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
361 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
362 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
363 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
364 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
365 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
366 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
367 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_sub_ps(rinv01
,sh_ewald
),velec
));
368 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
370 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velec
= _mm_and_ps(velec
,cutoff_mask
);
374 velecsum
= _mm_add_ps(velecsum
,velec
);
378 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
380 /* Calculate temporary vectorial force */
381 tx
= _mm_mul_ps(fscal
,dx01
);
382 ty
= _mm_mul_ps(fscal
,dy01
);
383 tz
= _mm_mul_ps(fscal
,dz01
);
385 /* Update vectorial force */
386 fix0
= _mm_add_ps(fix0
,tx
);
387 fiy0
= _mm_add_ps(fiy0
,ty
);
388 fiz0
= _mm_add_ps(fiz0
,tz
);
390 fjx1
= _mm_add_ps(fjx1
,tx
);
391 fjy1
= _mm_add_ps(fjy1
,ty
);
392 fjz1
= _mm_add_ps(fjz1
,tz
);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
403 r02
= _mm_mul_ps(rsq02
,rinv02
);
405 /* EWALD ELECTROSTATICS */
407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
409 ewitab
= _mm_cvttps_epi32(ewrt
);
410 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
411 ewitab
= _mm_slli_epi32(ewitab
,2);
412 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
413 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
414 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
415 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
416 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
417 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
418 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
419 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_sub_ps(rinv02
,sh_ewald
),velec
));
420 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
422 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velec
= _mm_and_ps(velec
,cutoff_mask
);
426 velecsum
= _mm_add_ps(velecsum
,velec
);
430 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
432 /* Calculate temporary vectorial force */
433 tx
= _mm_mul_ps(fscal
,dx02
);
434 ty
= _mm_mul_ps(fscal
,dy02
);
435 tz
= _mm_mul_ps(fscal
,dz02
);
437 /* Update vectorial force */
438 fix0
= _mm_add_ps(fix0
,tx
);
439 fiy0
= _mm_add_ps(fiy0
,ty
);
440 fiz0
= _mm_add_ps(fiz0
,tz
);
442 fjx2
= _mm_add_ps(fjx2
,tx
);
443 fjy2
= _mm_add_ps(fjy2
,ty
);
444 fjz2
= _mm_add_ps(fjz2
,tz
);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
455 r10
= _mm_mul_ps(rsq10
,rinv10
);
457 /* EWALD ELECTROSTATICS */
459 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
460 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
461 ewitab
= _mm_cvttps_epi32(ewrt
);
462 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
463 ewitab
= _mm_slli_epi32(ewitab
,2);
464 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
465 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
466 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
467 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
468 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
469 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
470 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
471 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_sub_ps(rinv10
,sh_ewald
),velec
));
472 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
474 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velec
= _mm_and_ps(velec
,cutoff_mask
);
478 velecsum
= _mm_add_ps(velecsum
,velec
);
482 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
484 /* Calculate temporary vectorial force */
485 tx
= _mm_mul_ps(fscal
,dx10
);
486 ty
= _mm_mul_ps(fscal
,dy10
);
487 tz
= _mm_mul_ps(fscal
,dz10
);
489 /* Update vectorial force */
490 fix1
= _mm_add_ps(fix1
,tx
);
491 fiy1
= _mm_add_ps(fiy1
,ty
);
492 fiz1
= _mm_add_ps(fiz1
,tz
);
494 fjx0
= _mm_add_ps(fjx0
,tx
);
495 fjy0
= _mm_add_ps(fjy0
,ty
);
496 fjz0
= _mm_add_ps(fjz0
,tz
);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
507 r11
= _mm_mul_ps(rsq11
,rinv11
);
509 /* EWALD ELECTROSTATICS */
511 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
512 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
513 ewitab
= _mm_cvttps_epi32(ewrt
);
514 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
515 ewitab
= _mm_slli_epi32(ewitab
,2);
516 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
517 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
518 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
519 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
520 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
521 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
522 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
523 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_sub_ps(rinv11
,sh_ewald
),velec
));
524 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
526 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velec
= _mm_and_ps(velec
,cutoff_mask
);
530 velecsum
= _mm_add_ps(velecsum
,velec
);
534 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
536 /* Calculate temporary vectorial force */
537 tx
= _mm_mul_ps(fscal
,dx11
);
538 ty
= _mm_mul_ps(fscal
,dy11
);
539 tz
= _mm_mul_ps(fscal
,dz11
);
541 /* Update vectorial force */
542 fix1
= _mm_add_ps(fix1
,tx
);
543 fiy1
= _mm_add_ps(fiy1
,ty
);
544 fiz1
= _mm_add_ps(fiz1
,tz
);
546 fjx1
= _mm_add_ps(fjx1
,tx
);
547 fjy1
= _mm_add_ps(fjy1
,ty
);
548 fjz1
= _mm_add_ps(fjz1
,tz
);
552 /**************************
553 * CALCULATE INTERACTIONS *
554 **************************/
556 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
559 r12
= _mm_mul_ps(rsq12
,rinv12
);
561 /* EWALD ELECTROSTATICS */
563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
564 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
565 ewitab
= _mm_cvttps_epi32(ewrt
);
566 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
567 ewitab
= _mm_slli_epi32(ewitab
,2);
568 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
569 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
570 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
571 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
572 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
573 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
574 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
575 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_sub_ps(rinv12
,sh_ewald
),velec
));
576 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
578 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
580 /* Update potential sum for this i atom from the interaction with this j atom. */
581 velec
= _mm_and_ps(velec
,cutoff_mask
);
582 velecsum
= _mm_add_ps(velecsum
,velec
);
586 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
588 /* Calculate temporary vectorial force */
589 tx
= _mm_mul_ps(fscal
,dx12
);
590 ty
= _mm_mul_ps(fscal
,dy12
);
591 tz
= _mm_mul_ps(fscal
,dz12
);
593 /* Update vectorial force */
594 fix1
= _mm_add_ps(fix1
,tx
);
595 fiy1
= _mm_add_ps(fiy1
,ty
);
596 fiz1
= _mm_add_ps(fiz1
,tz
);
598 fjx2
= _mm_add_ps(fjx2
,tx
);
599 fjy2
= _mm_add_ps(fjy2
,ty
);
600 fjz2
= _mm_add_ps(fjz2
,tz
);
604 /**************************
605 * CALCULATE INTERACTIONS *
606 **************************/
608 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
611 r20
= _mm_mul_ps(rsq20
,rinv20
);
613 /* EWALD ELECTROSTATICS */
615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
616 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
617 ewitab
= _mm_cvttps_epi32(ewrt
);
618 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
619 ewitab
= _mm_slli_epi32(ewitab
,2);
620 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
621 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
622 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
623 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
624 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
625 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
626 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
627 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_sub_ps(rinv20
,sh_ewald
),velec
));
628 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
630 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velec
= _mm_and_ps(velec
,cutoff_mask
);
634 velecsum
= _mm_add_ps(velecsum
,velec
);
638 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
640 /* Calculate temporary vectorial force */
641 tx
= _mm_mul_ps(fscal
,dx20
);
642 ty
= _mm_mul_ps(fscal
,dy20
);
643 tz
= _mm_mul_ps(fscal
,dz20
);
645 /* Update vectorial force */
646 fix2
= _mm_add_ps(fix2
,tx
);
647 fiy2
= _mm_add_ps(fiy2
,ty
);
648 fiz2
= _mm_add_ps(fiz2
,tz
);
650 fjx0
= _mm_add_ps(fjx0
,tx
);
651 fjy0
= _mm_add_ps(fjy0
,ty
);
652 fjz0
= _mm_add_ps(fjz0
,tz
);
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
663 r21
= _mm_mul_ps(rsq21
,rinv21
);
665 /* EWALD ELECTROSTATICS */
667 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
668 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
669 ewitab
= _mm_cvttps_epi32(ewrt
);
670 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
671 ewitab
= _mm_slli_epi32(ewitab
,2);
672 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
673 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
674 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
675 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
676 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
677 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
678 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
679 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_sub_ps(rinv21
,sh_ewald
),velec
));
680 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
682 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
684 /* Update potential sum for this i atom from the interaction with this j atom. */
685 velec
= _mm_and_ps(velec
,cutoff_mask
);
686 velecsum
= _mm_add_ps(velecsum
,velec
);
690 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
692 /* Calculate temporary vectorial force */
693 tx
= _mm_mul_ps(fscal
,dx21
);
694 ty
= _mm_mul_ps(fscal
,dy21
);
695 tz
= _mm_mul_ps(fscal
,dz21
);
697 /* Update vectorial force */
698 fix2
= _mm_add_ps(fix2
,tx
);
699 fiy2
= _mm_add_ps(fiy2
,ty
);
700 fiz2
= _mm_add_ps(fiz2
,tz
);
702 fjx1
= _mm_add_ps(fjx1
,tx
);
703 fjy1
= _mm_add_ps(fjy1
,ty
);
704 fjz1
= _mm_add_ps(fjz1
,tz
);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
715 r22
= _mm_mul_ps(rsq22
,rinv22
);
717 /* EWALD ELECTROSTATICS */
719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
721 ewitab
= _mm_cvttps_epi32(ewrt
);
722 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
723 ewitab
= _mm_slli_epi32(ewitab
,2);
724 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
725 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
726 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
727 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
728 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
729 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
730 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
731 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_sub_ps(rinv22
,sh_ewald
),velec
));
732 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
734 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec
= _mm_and_ps(velec
,cutoff_mask
);
738 velecsum
= _mm_add_ps(velecsum
,velec
);
742 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
744 /* Calculate temporary vectorial force */
745 tx
= _mm_mul_ps(fscal
,dx22
);
746 ty
= _mm_mul_ps(fscal
,dy22
);
747 tz
= _mm_mul_ps(fscal
,dz22
);
749 /* Update vectorial force */
750 fix2
= _mm_add_ps(fix2
,tx
);
751 fiy2
= _mm_add_ps(fiy2
,ty
);
752 fiz2
= _mm_add_ps(fiz2
,tz
);
754 fjx2
= _mm_add_ps(fjx2
,tx
);
755 fjy2
= _mm_add_ps(fjy2
,ty
);
756 fjz2
= _mm_add_ps(fjz2
,tz
);
760 fjptrA
= f
+j_coord_offsetA
;
761 fjptrB
= f
+j_coord_offsetB
;
762 fjptrC
= f
+j_coord_offsetC
;
763 fjptrD
= f
+j_coord_offsetD
;
765 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
766 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
768 /* Inner loop uses 414 flops */
774 /* Get j neighbor index, and coordinate index */
775 jnrlistA
= jjnr
[jidx
];
776 jnrlistB
= jjnr
[jidx
+1];
777 jnrlistC
= jjnr
[jidx
+2];
778 jnrlistD
= jjnr
[jidx
+3];
779 /* Sign of each element will be negative for non-real atoms.
780 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
781 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
783 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
784 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
785 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
786 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
787 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
788 j_coord_offsetA
= DIM
*jnrA
;
789 j_coord_offsetB
= DIM
*jnrB
;
790 j_coord_offsetC
= DIM
*jnrC
;
791 j_coord_offsetD
= DIM
*jnrD
;
793 /* load j atom coordinates */
794 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
795 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
796 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
798 /* Calculate displacement vector */
799 dx00
= _mm_sub_ps(ix0
,jx0
);
800 dy00
= _mm_sub_ps(iy0
,jy0
);
801 dz00
= _mm_sub_ps(iz0
,jz0
);
802 dx01
= _mm_sub_ps(ix0
,jx1
);
803 dy01
= _mm_sub_ps(iy0
,jy1
);
804 dz01
= _mm_sub_ps(iz0
,jz1
);
805 dx02
= _mm_sub_ps(ix0
,jx2
);
806 dy02
= _mm_sub_ps(iy0
,jy2
);
807 dz02
= _mm_sub_ps(iz0
,jz2
);
808 dx10
= _mm_sub_ps(ix1
,jx0
);
809 dy10
= _mm_sub_ps(iy1
,jy0
);
810 dz10
= _mm_sub_ps(iz1
,jz0
);
811 dx11
= _mm_sub_ps(ix1
,jx1
);
812 dy11
= _mm_sub_ps(iy1
,jy1
);
813 dz11
= _mm_sub_ps(iz1
,jz1
);
814 dx12
= _mm_sub_ps(ix1
,jx2
);
815 dy12
= _mm_sub_ps(iy1
,jy2
);
816 dz12
= _mm_sub_ps(iz1
,jz2
);
817 dx20
= _mm_sub_ps(ix2
,jx0
);
818 dy20
= _mm_sub_ps(iy2
,jy0
);
819 dz20
= _mm_sub_ps(iz2
,jz0
);
820 dx21
= _mm_sub_ps(ix2
,jx1
);
821 dy21
= _mm_sub_ps(iy2
,jy1
);
822 dz21
= _mm_sub_ps(iz2
,jz1
);
823 dx22
= _mm_sub_ps(ix2
,jx2
);
824 dy22
= _mm_sub_ps(iy2
,jy2
);
825 dz22
= _mm_sub_ps(iz2
,jz2
);
827 /* Calculate squared distance and things based on it */
828 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
829 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
830 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
831 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
832 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
833 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
834 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
835 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
836 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
838 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
839 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
840 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
841 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
842 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
843 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
844 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
845 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
846 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
848 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
849 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
850 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
851 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
852 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
853 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
854 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
855 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
856 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
858 fjx0
= _mm_setzero_ps();
859 fjy0
= _mm_setzero_ps();
860 fjz0
= _mm_setzero_ps();
861 fjx1
= _mm_setzero_ps();
862 fjy1
= _mm_setzero_ps();
863 fjz1
= _mm_setzero_ps();
864 fjx2
= _mm_setzero_ps();
865 fjy2
= _mm_setzero_ps();
866 fjz2
= _mm_setzero_ps();
868 /**************************
869 * CALCULATE INTERACTIONS *
870 **************************/
872 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
875 r00
= _mm_mul_ps(rsq00
,rinv00
);
876 r00
= _mm_andnot_ps(dummy_mask
,r00
);
878 /* EWALD ELECTROSTATICS */
880 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
881 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
882 ewitab
= _mm_cvttps_epi32(ewrt
);
883 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
884 ewitab
= _mm_slli_epi32(ewitab
,2);
885 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
886 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
887 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
888 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
889 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
890 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
891 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
892 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_sub_ps(rinv00
,sh_ewald
),velec
));
893 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
895 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
897 /* Update potential sum for this i atom from the interaction with this j atom. */
898 velec
= _mm_and_ps(velec
,cutoff_mask
);
899 velec
= _mm_andnot_ps(dummy_mask
,velec
);
900 velecsum
= _mm_add_ps(velecsum
,velec
);
904 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
906 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
908 /* Calculate temporary vectorial force */
909 tx
= _mm_mul_ps(fscal
,dx00
);
910 ty
= _mm_mul_ps(fscal
,dy00
);
911 tz
= _mm_mul_ps(fscal
,dz00
);
913 /* Update vectorial force */
914 fix0
= _mm_add_ps(fix0
,tx
);
915 fiy0
= _mm_add_ps(fiy0
,ty
);
916 fiz0
= _mm_add_ps(fiz0
,tz
);
918 fjx0
= _mm_add_ps(fjx0
,tx
);
919 fjy0
= _mm_add_ps(fjy0
,ty
);
920 fjz0
= _mm_add_ps(fjz0
,tz
);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
931 r01
= _mm_mul_ps(rsq01
,rinv01
);
932 r01
= _mm_andnot_ps(dummy_mask
,r01
);
934 /* EWALD ELECTROSTATICS */
936 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
937 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
938 ewitab
= _mm_cvttps_epi32(ewrt
);
939 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
940 ewitab
= _mm_slli_epi32(ewitab
,2);
941 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
942 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
943 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
944 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
945 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
946 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
947 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
948 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_sub_ps(rinv01
,sh_ewald
),velec
));
949 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
951 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec
= _mm_and_ps(velec
,cutoff_mask
);
955 velec
= _mm_andnot_ps(dummy_mask
,velec
);
956 velecsum
= _mm_add_ps(velecsum
,velec
);
960 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
962 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
964 /* Calculate temporary vectorial force */
965 tx
= _mm_mul_ps(fscal
,dx01
);
966 ty
= _mm_mul_ps(fscal
,dy01
);
967 tz
= _mm_mul_ps(fscal
,dz01
);
969 /* Update vectorial force */
970 fix0
= _mm_add_ps(fix0
,tx
);
971 fiy0
= _mm_add_ps(fiy0
,ty
);
972 fiz0
= _mm_add_ps(fiz0
,tz
);
974 fjx1
= _mm_add_ps(fjx1
,tx
);
975 fjy1
= _mm_add_ps(fjy1
,ty
);
976 fjz1
= _mm_add_ps(fjz1
,tz
);
980 /**************************
981 * CALCULATE INTERACTIONS *
982 **************************/
984 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
987 r02
= _mm_mul_ps(rsq02
,rinv02
);
988 r02
= _mm_andnot_ps(dummy_mask
,r02
);
990 /* EWALD ELECTROSTATICS */
992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
993 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
994 ewitab
= _mm_cvttps_epi32(ewrt
);
995 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
996 ewitab
= _mm_slli_epi32(ewitab
,2);
997 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
998 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
999 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1000 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1001 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1002 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1003 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1004 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_sub_ps(rinv02
,sh_ewald
),velec
));
1005 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
1007 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 velec
= _mm_and_ps(velec
,cutoff_mask
);
1011 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1012 velecsum
= _mm_add_ps(velecsum
,velec
);
1016 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1018 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1020 /* Calculate temporary vectorial force */
1021 tx
= _mm_mul_ps(fscal
,dx02
);
1022 ty
= _mm_mul_ps(fscal
,dy02
);
1023 tz
= _mm_mul_ps(fscal
,dz02
);
1025 /* Update vectorial force */
1026 fix0
= _mm_add_ps(fix0
,tx
);
1027 fiy0
= _mm_add_ps(fiy0
,ty
);
1028 fiz0
= _mm_add_ps(fiz0
,tz
);
1030 fjx2
= _mm_add_ps(fjx2
,tx
);
1031 fjy2
= _mm_add_ps(fjy2
,ty
);
1032 fjz2
= _mm_add_ps(fjz2
,tz
);
1036 /**************************
1037 * CALCULATE INTERACTIONS *
1038 **************************/
1040 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1043 r10
= _mm_mul_ps(rsq10
,rinv10
);
1044 r10
= _mm_andnot_ps(dummy_mask
,r10
);
1046 /* EWALD ELECTROSTATICS */
1048 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1049 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1050 ewitab
= _mm_cvttps_epi32(ewrt
);
1051 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1052 ewitab
= _mm_slli_epi32(ewitab
,2);
1053 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1054 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1055 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1056 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1057 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1058 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1059 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1060 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_sub_ps(rinv10
,sh_ewald
),velec
));
1061 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1063 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec
= _mm_and_ps(velec
,cutoff_mask
);
1067 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1068 velecsum
= _mm_add_ps(velecsum
,velec
);
1072 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1074 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1076 /* Calculate temporary vectorial force */
1077 tx
= _mm_mul_ps(fscal
,dx10
);
1078 ty
= _mm_mul_ps(fscal
,dy10
);
1079 tz
= _mm_mul_ps(fscal
,dz10
);
1081 /* Update vectorial force */
1082 fix1
= _mm_add_ps(fix1
,tx
);
1083 fiy1
= _mm_add_ps(fiy1
,ty
);
1084 fiz1
= _mm_add_ps(fiz1
,tz
);
1086 fjx0
= _mm_add_ps(fjx0
,tx
);
1087 fjy0
= _mm_add_ps(fjy0
,ty
);
1088 fjz0
= _mm_add_ps(fjz0
,tz
);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1099 r11
= _mm_mul_ps(rsq11
,rinv11
);
1100 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1102 /* EWALD ELECTROSTATICS */
1104 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1105 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1106 ewitab
= _mm_cvttps_epi32(ewrt
);
1107 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1108 ewitab
= _mm_slli_epi32(ewitab
,2);
1109 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1110 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1111 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1112 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1113 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1114 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1115 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1116 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_sub_ps(rinv11
,sh_ewald
),velec
));
1117 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1119 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec
= _mm_and_ps(velec
,cutoff_mask
);
1123 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1124 velecsum
= _mm_add_ps(velecsum
,velec
);
1128 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1130 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1132 /* Calculate temporary vectorial force */
1133 tx
= _mm_mul_ps(fscal
,dx11
);
1134 ty
= _mm_mul_ps(fscal
,dy11
);
1135 tz
= _mm_mul_ps(fscal
,dz11
);
1137 /* Update vectorial force */
1138 fix1
= _mm_add_ps(fix1
,tx
);
1139 fiy1
= _mm_add_ps(fiy1
,ty
);
1140 fiz1
= _mm_add_ps(fiz1
,tz
);
1142 fjx1
= _mm_add_ps(fjx1
,tx
);
1143 fjy1
= _mm_add_ps(fjy1
,ty
);
1144 fjz1
= _mm_add_ps(fjz1
,tz
);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1155 r12
= _mm_mul_ps(rsq12
,rinv12
);
1156 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1158 /* EWALD ELECTROSTATICS */
1160 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1161 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1162 ewitab
= _mm_cvttps_epi32(ewrt
);
1163 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1164 ewitab
= _mm_slli_epi32(ewitab
,2);
1165 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1166 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1167 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1168 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1169 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1170 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1171 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1172 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_sub_ps(rinv12
,sh_ewald
),velec
));
1173 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1175 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec
= _mm_and_ps(velec
,cutoff_mask
);
1179 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1180 velecsum
= _mm_add_ps(velecsum
,velec
);
1184 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1186 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1188 /* Calculate temporary vectorial force */
1189 tx
= _mm_mul_ps(fscal
,dx12
);
1190 ty
= _mm_mul_ps(fscal
,dy12
);
1191 tz
= _mm_mul_ps(fscal
,dz12
);
1193 /* Update vectorial force */
1194 fix1
= _mm_add_ps(fix1
,tx
);
1195 fiy1
= _mm_add_ps(fiy1
,ty
);
1196 fiz1
= _mm_add_ps(fiz1
,tz
);
1198 fjx2
= _mm_add_ps(fjx2
,tx
);
1199 fjy2
= _mm_add_ps(fjy2
,ty
);
1200 fjz2
= _mm_add_ps(fjz2
,tz
);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1211 r20
= _mm_mul_ps(rsq20
,rinv20
);
1212 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1218 ewitab
= _mm_cvttps_epi32(ewrt
);
1219 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1220 ewitab
= _mm_slli_epi32(ewitab
,2);
1221 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1222 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1223 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1224 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1225 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1226 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1227 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1228 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_sub_ps(rinv20
,sh_ewald
),velec
));
1229 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1231 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1233 /* Update potential sum for this i atom from the interaction with this j atom. */
1234 velec
= _mm_and_ps(velec
,cutoff_mask
);
1235 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1236 velecsum
= _mm_add_ps(velecsum
,velec
);
1240 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1242 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1244 /* Calculate temporary vectorial force */
1245 tx
= _mm_mul_ps(fscal
,dx20
);
1246 ty
= _mm_mul_ps(fscal
,dy20
);
1247 tz
= _mm_mul_ps(fscal
,dz20
);
1249 /* Update vectorial force */
1250 fix2
= _mm_add_ps(fix2
,tx
);
1251 fiy2
= _mm_add_ps(fiy2
,ty
);
1252 fiz2
= _mm_add_ps(fiz2
,tz
);
1254 fjx0
= _mm_add_ps(fjx0
,tx
);
1255 fjy0
= _mm_add_ps(fjy0
,ty
);
1256 fjz0
= _mm_add_ps(fjz0
,tz
);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1267 r21
= _mm_mul_ps(rsq21
,rinv21
);
1268 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1270 /* EWALD ELECTROSTATICS */
1272 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1273 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1274 ewitab
= _mm_cvttps_epi32(ewrt
);
1275 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1276 ewitab
= _mm_slli_epi32(ewitab
,2);
1277 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1278 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1279 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1280 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1281 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1282 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1283 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1284 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_sub_ps(rinv21
,sh_ewald
),velec
));
1285 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1287 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1289 /* Update potential sum for this i atom from the interaction with this j atom. */
1290 velec
= _mm_and_ps(velec
,cutoff_mask
);
1291 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1292 velecsum
= _mm_add_ps(velecsum
,velec
);
1296 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1298 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1300 /* Calculate temporary vectorial force */
1301 tx
= _mm_mul_ps(fscal
,dx21
);
1302 ty
= _mm_mul_ps(fscal
,dy21
);
1303 tz
= _mm_mul_ps(fscal
,dz21
);
1305 /* Update vectorial force */
1306 fix2
= _mm_add_ps(fix2
,tx
);
1307 fiy2
= _mm_add_ps(fiy2
,ty
);
1308 fiz2
= _mm_add_ps(fiz2
,tz
);
1310 fjx1
= _mm_add_ps(fjx1
,tx
);
1311 fjy1
= _mm_add_ps(fjy1
,ty
);
1312 fjz1
= _mm_add_ps(fjz1
,tz
);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1323 r22
= _mm_mul_ps(rsq22
,rinv22
);
1324 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1326 /* EWALD ELECTROSTATICS */
1328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1329 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
1330 ewitab
= _mm_cvttps_epi32(ewrt
);
1331 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1332 ewitab
= _mm_slli_epi32(ewitab
,2);
1333 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1334 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1335 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1336 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1337 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1338 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1339 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1340 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_sub_ps(rinv22
,sh_ewald
),velec
));
1341 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
1343 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1345 /* Update potential sum for this i atom from the interaction with this j atom. */
1346 velec
= _mm_and_ps(velec
,cutoff_mask
);
1347 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1348 velecsum
= _mm_add_ps(velecsum
,velec
);
1352 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1354 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1356 /* Calculate temporary vectorial force */
1357 tx
= _mm_mul_ps(fscal
,dx22
);
1358 ty
= _mm_mul_ps(fscal
,dy22
);
1359 tz
= _mm_mul_ps(fscal
,dz22
);
1361 /* Update vectorial force */
1362 fix2
= _mm_add_ps(fix2
,tx
);
1363 fiy2
= _mm_add_ps(fiy2
,ty
);
1364 fiz2
= _mm_add_ps(fiz2
,tz
);
1366 fjx2
= _mm_add_ps(fjx2
,tx
);
1367 fjy2
= _mm_add_ps(fjy2
,ty
);
1368 fjz2
= _mm_add_ps(fjz2
,tz
);
1372 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1373 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1374 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1375 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1377 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1378 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1380 /* Inner loop uses 423 flops */
1383 /* End of innermost loop */
1385 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1386 f
+i_coord_offset
,fshift
+i_shift_offset
);
1389 /* Update potential energies */
1390 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1392 /* Increment number of inner iterations */
1393 inneriter
+= j_index_end
- j_index_start
;
1395 /* Outer loop uses 19 flops */
1398 /* Increment number of outer iterations */
1401 /* Update outer/inner flops */
1403 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3W3_VF
,outeriter
*19 + inneriter
*423);
1406 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1407 * Electrostatics interaction: Ewald
1408 * VdW interaction: None
1409 * Geometry: Water3-Water3
1410 * Calculate force/pot: Force
1413 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1414 (t_nblist
* gmx_restrict nlist
,
1415 rvec
* gmx_restrict xx
,
1416 rvec
* gmx_restrict ff
,
1417 t_forcerec
* gmx_restrict fr
,
1418 t_mdatoms
* gmx_restrict mdatoms
,
1419 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1420 t_nrnb
* gmx_restrict nrnb
)
1422 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1423 * just 0 for non-waters.
1424 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1425 * jnr indices corresponding to data put in the four positions in the SIMD register.
1427 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1428 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1429 int jnrA
,jnrB
,jnrC
,jnrD
;
1430 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1431 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1432 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1433 real rcutoff_scalar
;
1434 real
*shiftvec
,*fshift
,*x
,*f
;
1435 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1436 real scratch
[4*DIM
];
1437 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1439 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1441 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1443 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1444 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1445 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1446 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1447 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1448 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1449 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1450 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1451 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1452 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1453 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1454 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1455 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1456 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1457 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1458 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1459 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1462 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1464 __m128 dummy_mask
,cutoff_mask
;
1465 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1466 __m128 one
= _mm_set1_ps(1.0);
1467 __m128 two
= _mm_set1_ps(2.0);
1473 jindex
= nlist
->jindex
;
1475 shiftidx
= nlist
->shift
;
1477 shiftvec
= fr
->shift_vec
[0];
1478 fshift
= fr
->fshift
[0];
1479 facel
= _mm_set1_ps(fr
->epsfac
);
1480 charge
= mdatoms
->chargeA
;
1482 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1483 ewtab
= fr
->ic
->tabq_coul_F
;
1484 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1485 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1487 /* Setup water-specific parameters */
1488 inr
= nlist
->iinr
[0];
1489 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1490 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1491 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1493 jq0
= _mm_set1_ps(charge
[inr
+0]);
1494 jq1
= _mm_set1_ps(charge
[inr
+1]);
1495 jq2
= _mm_set1_ps(charge
[inr
+2]);
1496 qq00
= _mm_mul_ps(iq0
,jq0
);
1497 qq01
= _mm_mul_ps(iq0
,jq1
);
1498 qq02
= _mm_mul_ps(iq0
,jq2
);
1499 qq10
= _mm_mul_ps(iq1
,jq0
);
1500 qq11
= _mm_mul_ps(iq1
,jq1
);
1501 qq12
= _mm_mul_ps(iq1
,jq2
);
1502 qq20
= _mm_mul_ps(iq2
,jq0
);
1503 qq21
= _mm_mul_ps(iq2
,jq1
);
1504 qq22
= _mm_mul_ps(iq2
,jq2
);
1506 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1507 rcutoff_scalar
= fr
->rcoulomb
;
1508 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1509 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1511 /* Avoid stupid compiler warnings */
1512 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1513 j_coord_offsetA
= 0;
1514 j_coord_offsetB
= 0;
1515 j_coord_offsetC
= 0;
1516 j_coord_offsetD
= 0;
1521 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1523 scratch
[iidx
] = 0.0;
1526 /* Start outer loop over neighborlists */
1527 for(iidx
=0; iidx
<nri
; iidx
++)
1529 /* Load shift vector for this list */
1530 i_shift_offset
= DIM
*shiftidx
[iidx
];
1532 /* Load limits for loop over neighbors */
1533 j_index_start
= jindex
[iidx
];
1534 j_index_end
= jindex
[iidx
+1];
1536 /* Get outer coordinate index */
1538 i_coord_offset
= DIM
*inr
;
1540 /* Load i particle coords and add shift vector */
1541 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1542 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1544 fix0
= _mm_setzero_ps();
1545 fiy0
= _mm_setzero_ps();
1546 fiz0
= _mm_setzero_ps();
1547 fix1
= _mm_setzero_ps();
1548 fiy1
= _mm_setzero_ps();
1549 fiz1
= _mm_setzero_ps();
1550 fix2
= _mm_setzero_ps();
1551 fiy2
= _mm_setzero_ps();
1552 fiz2
= _mm_setzero_ps();
1554 /* Start inner kernel loop */
1555 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1558 /* Get j neighbor index, and coordinate index */
1560 jnrB
= jjnr
[jidx
+1];
1561 jnrC
= jjnr
[jidx
+2];
1562 jnrD
= jjnr
[jidx
+3];
1563 j_coord_offsetA
= DIM
*jnrA
;
1564 j_coord_offsetB
= DIM
*jnrB
;
1565 j_coord_offsetC
= DIM
*jnrC
;
1566 j_coord_offsetD
= DIM
*jnrD
;
1568 /* load j atom coordinates */
1569 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1570 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1571 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1573 /* Calculate displacement vector */
1574 dx00
= _mm_sub_ps(ix0
,jx0
);
1575 dy00
= _mm_sub_ps(iy0
,jy0
);
1576 dz00
= _mm_sub_ps(iz0
,jz0
);
1577 dx01
= _mm_sub_ps(ix0
,jx1
);
1578 dy01
= _mm_sub_ps(iy0
,jy1
);
1579 dz01
= _mm_sub_ps(iz0
,jz1
);
1580 dx02
= _mm_sub_ps(ix0
,jx2
);
1581 dy02
= _mm_sub_ps(iy0
,jy2
);
1582 dz02
= _mm_sub_ps(iz0
,jz2
);
1583 dx10
= _mm_sub_ps(ix1
,jx0
);
1584 dy10
= _mm_sub_ps(iy1
,jy0
);
1585 dz10
= _mm_sub_ps(iz1
,jz0
);
1586 dx11
= _mm_sub_ps(ix1
,jx1
);
1587 dy11
= _mm_sub_ps(iy1
,jy1
);
1588 dz11
= _mm_sub_ps(iz1
,jz1
);
1589 dx12
= _mm_sub_ps(ix1
,jx2
);
1590 dy12
= _mm_sub_ps(iy1
,jy2
);
1591 dz12
= _mm_sub_ps(iz1
,jz2
);
1592 dx20
= _mm_sub_ps(ix2
,jx0
);
1593 dy20
= _mm_sub_ps(iy2
,jy0
);
1594 dz20
= _mm_sub_ps(iz2
,jz0
);
1595 dx21
= _mm_sub_ps(ix2
,jx1
);
1596 dy21
= _mm_sub_ps(iy2
,jy1
);
1597 dz21
= _mm_sub_ps(iz2
,jz1
);
1598 dx22
= _mm_sub_ps(ix2
,jx2
);
1599 dy22
= _mm_sub_ps(iy2
,jy2
);
1600 dz22
= _mm_sub_ps(iz2
,jz2
);
1602 /* Calculate squared distance and things based on it */
1603 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1604 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1605 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1606 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1607 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1608 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1609 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1610 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1611 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1613 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1614 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1615 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1616 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1617 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1618 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1619 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1620 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1621 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1623 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1624 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1625 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1626 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1627 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1628 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1629 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1630 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1631 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1633 fjx0
= _mm_setzero_ps();
1634 fjy0
= _mm_setzero_ps();
1635 fjz0
= _mm_setzero_ps();
1636 fjx1
= _mm_setzero_ps();
1637 fjy1
= _mm_setzero_ps();
1638 fjz1
= _mm_setzero_ps();
1639 fjx2
= _mm_setzero_ps();
1640 fjy2
= _mm_setzero_ps();
1641 fjz2
= _mm_setzero_ps();
1643 /**************************
1644 * CALCULATE INTERACTIONS *
1645 **************************/
1647 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1650 r00
= _mm_mul_ps(rsq00
,rinv00
);
1652 /* EWALD ELECTROSTATICS */
1654 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1655 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1656 ewitab
= _mm_cvttps_epi32(ewrt
);
1657 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1658 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1659 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1661 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1662 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1664 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1668 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1670 /* Calculate temporary vectorial force */
1671 tx
= _mm_mul_ps(fscal
,dx00
);
1672 ty
= _mm_mul_ps(fscal
,dy00
);
1673 tz
= _mm_mul_ps(fscal
,dz00
);
1675 /* Update vectorial force */
1676 fix0
= _mm_add_ps(fix0
,tx
);
1677 fiy0
= _mm_add_ps(fiy0
,ty
);
1678 fiz0
= _mm_add_ps(fiz0
,tz
);
1680 fjx0
= _mm_add_ps(fjx0
,tx
);
1681 fjy0
= _mm_add_ps(fjy0
,ty
);
1682 fjz0
= _mm_add_ps(fjz0
,tz
);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1693 r01
= _mm_mul_ps(rsq01
,rinv01
);
1695 /* EWALD ELECTROSTATICS */
1697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1698 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
1699 ewitab
= _mm_cvttps_epi32(ewrt
);
1700 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1701 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1702 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1704 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1705 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
1707 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
1711 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1713 /* Calculate temporary vectorial force */
1714 tx
= _mm_mul_ps(fscal
,dx01
);
1715 ty
= _mm_mul_ps(fscal
,dy01
);
1716 tz
= _mm_mul_ps(fscal
,dz01
);
1718 /* Update vectorial force */
1719 fix0
= _mm_add_ps(fix0
,tx
);
1720 fiy0
= _mm_add_ps(fiy0
,ty
);
1721 fiz0
= _mm_add_ps(fiz0
,tz
);
1723 fjx1
= _mm_add_ps(fjx1
,tx
);
1724 fjy1
= _mm_add_ps(fjy1
,ty
);
1725 fjz1
= _mm_add_ps(fjz1
,tz
);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1736 r02
= _mm_mul_ps(rsq02
,rinv02
);
1738 /* EWALD ELECTROSTATICS */
1740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1741 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
1742 ewitab
= _mm_cvttps_epi32(ewrt
);
1743 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1744 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1745 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1747 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1748 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
1750 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
1754 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1756 /* Calculate temporary vectorial force */
1757 tx
= _mm_mul_ps(fscal
,dx02
);
1758 ty
= _mm_mul_ps(fscal
,dy02
);
1759 tz
= _mm_mul_ps(fscal
,dz02
);
1761 /* Update vectorial force */
1762 fix0
= _mm_add_ps(fix0
,tx
);
1763 fiy0
= _mm_add_ps(fiy0
,ty
);
1764 fiz0
= _mm_add_ps(fiz0
,tz
);
1766 fjx2
= _mm_add_ps(fjx2
,tx
);
1767 fjy2
= _mm_add_ps(fjy2
,ty
);
1768 fjz2
= _mm_add_ps(fjz2
,tz
);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1779 r10
= _mm_mul_ps(rsq10
,rinv10
);
1781 /* EWALD ELECTROSTATICS */
1783 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1784 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1785 ewitab
= _mm_cvttps_epi32(ewrt
);
1786 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1787 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1788 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1790 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1791 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1793 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1797 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1799 /* Calculate temporary vectorial force */
1800 tx
= _mm_mul_ps(fscal
,dx10
);
1801 ty
= _mm_mul_ps(fscal
,dy10
);
1802 tz
= _mm_mul_ps(fscal
,dz10
);
1804 /* Update vectorial force */
1805 fix1
= _mm_add_ps(fix1
,tx
);
1806 fiy1
= _mm_add_ps(fiy1
,ty
);
1807 fiz1
= _mm_add_ps(fiz1
,tz
);
1809 fjx0
= _mm_add_ps(fjx0
,tx
);
1810 fjy0
= _mm_add_ps(fjy0
,ty
);
1811 fjz0
= _mm_add_ps(fjz0
,tz
);
1815 /**************************
1816 * CALCULATE INTERACTIONS *
1817 **************************/
1819 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1822 r11
= _mm_mul_ps(rsq11
,rinv11
);
1824 /* EWALD ELECTROSTATICS */
1826 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1827 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1828 ewitab
= _mm_cvttps_epi32(ewrt
);
1829 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1830 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1831 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1833 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1834 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1836 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1840 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1842 /* Calculate temporary vectorial force */
1843 tx
= _mm_mul_ps(fscal
,dx11
);
1844 ty
= _mm_mul_ps(fscal
,dy11
);
1845 tz
= _mm_mul_ps(fscal
,dz11
);
1847 /* Update vectorial force */
1848 fix1
= _mm_add_ps(fix1
,tx
);
1849 fiy1
= _mm_add_ps(fiy1
,ty
);
1850 fiz1
= _mm_add_ps(fiz1
,tz
);
1852 fjx1
= _mm_add_ps(fjx1
,tx
);
1853 fjy1
= _mm_add_ps(fjy1
,ty
);
1854 fjz1
= _mm_add_ps(fjz1
,tz
);
1858 /**************************
1859 * CALCULATE INTERACTIONS *
1860 **************************/
1862 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1865 r12
= _mm_mul_ps(rsq12
,rinv12
);
1867 /* EWALD ELECTROSTATICS */
1869 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1870 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1871 ewitab
= _mm_cvttps_epi32(ewrt
);
1872 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1873 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1874 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1876 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1877 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1879 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1883 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1885 /* Calculate temporary vectorial force */
1886 tx
= _mm_mul_ps(fscal
,dx12
);
1887 ty
= _mm_mul_ps(fscal
,dy12
);
1888 tz
= _mm_mul_ps(fscal
,dz12
);
1890 /* Update vectorial force */
1891 fix1
= _mm_add_ps(fix1
,tx
);
1892 fiy1
= _mm_add_ps(fiy1
,ty
);
1893 fiz1
= _mm_add_ps(fiz1
,tz
);
1895 fjx2
= _mm_add_ps(fjx2
,tx
);
1896 fjy2
= _mm_add_ps(fjy2
,ty
);
1897 fjz2
= _mm_add_ps(fjz2
,tz
);
1901 /**************************
1902 * CALCULATE INTERACTIONS *
1903 **************************/
1905 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1908 r20
= _mm_mul_ps(rsq20
,rinv20
);
1910 /* EWALD ELECTROSTATICS */
1912 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1913 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1914 ewitab
= _mm_cvttps_epi32(ewrt
);
1915 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1916 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1917 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1919 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1920 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1922 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1926 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1928 /* Calculate temporary vectorial force */
1929 tx
= _mm_mul_ps(fscal
,dx20
);
1930 ty
= _mm_mul_ps(fscal
,dy20
);
1931 tz
= _mm_mul_ps(fscal
,dz20
);
1933 /* Update vectorial force */
1934 fix2
= _mm_add_ps(fix2
,tx
);
1935 fiy2
= _mm_add_ps(fiy2
,ty
);
1936 fiz2
= _mm_add_ps(fiz2
,tz
);
1938 fjx0
= _mm_add_ps(fjx0
,tx
);
1939 fjy0
= _mm_add_ps(fjy0
,ty
);
1940 fjz0
= _mm_add_ps(fjz0
,tz
);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1951 r21
= _mm_mul_ps(rsq21
,rinv21
);
1953 /* EWALD ELECTROSTATICS */
1955 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1956 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1957 ewitab
= _mm_cvttps_epi32(ewrt
);
1958 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1959 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1960 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1962 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1963 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1965 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1969 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1971 /* Calculate temporary vectorial force */
1972 tx
= _mm_mul_ps(fscal
,dx21
);
1973 ty
= _mm_mul_ps(fscal
,dy21
);
1974 tz
= _mm_mul_ps(fscal
,dz21
);
1976 /* Update vectorial force */
1977 fix2
= _mm_add_ps(fix2
,tx
);
1978 fiy2
= _mm_add_ps(fiy2
,ty
);
1979 fiz2
= _mm_add_ps(fiz2
,tz
);
1981 fjx1
= _mm_add_ps(fjx1
,tx
);
1982 fjy1
= _mm_add_ps(fjy1
,ty
);
1983 fjz1
= _mm_add_ps(fjz1
,tz
);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1994 r22
= _mm_mul_ps(rsq22
,rinv22
);
1996 /* EWALD ELECTROSTATICS */
1998 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1999 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2000 ewitab
= _mm_cvttps_epi32(ewrt
);
2001 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2002 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2003 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2005 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2006 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2008 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2012 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2014 /* Calculate temporary vectorial force */
2015 tx
= _mm_mul_ps(fscal
,dx22
);
2016 ty
= _mm_mul_ps(fscal
,dy22
);
2017 tz
= _mm_mul_ps(fscal
,dz22
);
2019 /* Update vectorial force */
2020 fix2
= _mm_add_ps(fix2
,tx
);
2021 fiy2
= _mm_add_ps(fiy2
,ty
);
2022 fiz2
= _mm_add_ps(fiz2
,tz
);
2024 fjx2
= _mm_add_ps(fjx2
,tx
);
2025 fjy2
= _mm_add_ps(fjy2
,ty
);
2026 fjz2
= _mm_add_ps(fjz2
,tz
);
2030 fjptrA
= f
+j_coord_offsetA
;
2031 fjptrB
= f
+j_coord_offsetB
;
2032 fjptrC
= f
+j_coord_offsetC
;
2033 fjptrD
= f
+j_coord_offsetD
;
2035 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2036 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2038 /* Inner loop uses 351 flops */
2041 if(jidx
<j_index_end
)
2044 /* Get j neighbor index, and coordinate index */
2045 jnrlistA
= jjnr
[jidx
];
2046 jnrlistB
= jjnr
[jidx
+1];
2047 jnrlistC
= jjnr
[jidx
+2];
2048 jnrlistD
= jjnr
[jidx
+3];
2049 /* Sign of each element will be negative for non-real atoms.
2050 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2051 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2053 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2054 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2055 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2056 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2057 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2058 j_coord_offsetA
= DIM
*jnrA
;
2059 j_coord_offsetB
= DIM
*jnrB
;
2060 j_coord_offsetC
= DIM
*jnrC
;
2061 j_coord_offsetD
= DIM
*jnrD
;
2063 /* load j atom coordinates */
2064 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2065 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2066 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2068 /* Calculate displacement vector */
2069 dx00
= _mm_sub_ps(ix0
,jx0
);
2070 dy00
= _mm_sub_ps(iy0
,jy0
);
2071 dz00
= _mm_sub_ps(iz0
,jz0
);
2072 dx01
= _mm_sub_ps(ix0
,jx1
);
2073 dy01
= _mm_sub_ps(iy0
,jy1
);
2074 dz01
= _mm_sub_ps(iz0
,jz1
);
2075 dx02
= _mm_sub_ps(ix0
,jx2
);
2076 dy02
= _mm_sub_ps(iy0
,jy2
);
2077 dz02
= _mm_sub_ps(iz0
,jz2
);
2078 dx10
= _mm_sub_ps(ix1
,jx0
);
2079 dy10
= _mm_sub_ps(iy1
,jy0
);
2080 dz10
= _mm_sub_ps(iz1
,jz0
);
2081 dx11
= _mm_sub_ps(ix1
,jx1
);
2082 dy11
= _mm_sub_ps(iy1
,jy1
);
2083 dz11
= _mm_sub_ps(iz1
,jz1
);
2084 dx12
= _mm_sub_ps(ix1
,jx2
);
2085 dy12
= _mm_sub_ps(iy1
,jy2
);
2086 dz12
= _mm_sub_ps(iz1
,jz2
);
2087 dx20
= _mm_sub_ps(ix2
,jx0
);
2088 dy20
= _mm_sub_ps(iy2
,jy0
);
2089 dz20
= _mm_sub_ps(iz2
,jz0
);
2090 dx21
= _mm_sub_ps(ix2
,jx1
);
2091 dy21
= _mm_sub_ps(iy2
,jy1
);
2092 dz21
= _mm_sub_ps(iz2
,jz1
);
2093 dx22
= _mm_sub_ps(ix2
,jx2
);
2094 dy22
= _mm_sub_ps(iy2
,jy2
);
2095 dz22
= _mm_sub_ps(iz2
,jz2
);
2097 /* Calculate squared distance and things based on it */
2098 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
2099 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
2100 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
2101 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
2102 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
2103 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
2104 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
2105 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2106 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2108 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
2109 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
2110 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
2111 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
2112 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
2113 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
2114 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
2115 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
2116 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
2118 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
2119 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
2120 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
2121 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
2122 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
2123 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
2124 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
2125 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
2126 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
2128 fjx0
= _mm_setzero_ps();
2129 fjy0
= _mm_setzero_ps();
2130 fjz0
= _mm_setzero_ps();
2131 fjx1
= _mm_setzero_ps();
2132 fjy1
= _mm_setzero_ps();
2133 fjz1
= _mm_setzero_ps();
2134 fjx2
= _mm_setzero_ps();
2135 fjy2
= _mm_setzero_ps();
2136 fjz2
= _mm_setzero_ps();
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2145 r00
= _mm_mul_ps(rsq00
,rinv00
);
2146 r00
= _mm_andnot_ps(dummy_mask
,r00
);
2148 /* EWALD ELECTROSTATICS */
2150 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2151 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
2152 ewitab
= _mm_cvttps_epi32(ewrt
);
2153 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2154 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2155 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2157 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2158 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
2160 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
2164 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2166 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2168 /* Calculate temporary vectorial force */
2169 tx
= _mm_mul_ps(fscal
,dx00
);
2170 ty
= _mm_mul_ps(fscal
,dy00
);
2171 tz
= _mm_mul_ps(fscal
,dz00
);
2173 /* Update vectorial force */
2174 fix0
= _mm_add_ps(fix0
,tx
);
2175 fiy0
= _mm_add_ps(fiy0
,ty
);
2176 fiz0
= _mm_add_ps(fiz0
,tz
);
2178 fjx0
= _mm_add_ps(fjx0
,tx
);
2179 fjy0
= _mm_add_ps(fjy0
,ty
);
2180 fjz0
= _mm_add_ps(fjz0
,tz
);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
2191 r01
= _mm_mul_ps(rsq01
,rinv01
);
2192 r01
= _mm_andnot_ps(dummy_mask
,r01
);
2194 /* EWALD ELECTROSTATICS */
2196 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2197 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
2198 ewitab
= _mm_cvttps_epi32(ewrt
);
2199 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2200 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2201 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2203 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2204 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
2206 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
2210 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2212 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2214 /* Calculate temporary vectorial force */
2215 tx
= _mm_mul_ps(fscal
,dx01
);
2216 ty
= _mm_mul_ps(fscal
,dy01
);
2217 tz
= _mm_mul_ps(fscal
,dz01
);
2219 /* Update vectorial force */
2220 fix0
= _mm_add_ps(fix0
,tx
);
2221 fiy0
= _mm_add_ps(fiy0
,ty
);
2222 fiz0
= _mm_add_ps(fiz0
,tz
);
2224 fjx1
= _mm_add_ps(fjx1
,tx
);
2225 fjy1
= _mm_add_ps(fjy1
,ty
);
2226 fjz1
= _mm_add_ps(fjz1
,tz
);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2237 r02
= _mm_mul_ps(rsq02
,rinv02
);
2238 r02
= _mm_andnot_ps(dummy_mask
,r02
);
2240 /* EWALD ELECTROSTATICS */
2242 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2243 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
2244 ewitab
= _mm_cvttps_epi32(ewrt
);
2245 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2246 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2247 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2249 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2250 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
2252 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
2256 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2258 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2260 /* Calculate temporary vectorial force */
2261 tx
= _mm_mul_ps(fscal
,dx02
);
2262 ty
= _mm_mul_ps(fscal
,dy02
);
2263 tz
= _mm_mul_ps(fscal
,dz02
);
2265 /* Update vectorial force */
2266 fix0
= _mm_add_ps(fix0
,tx
);
2267 fiy0
= _mm_add_ps(fiy0
,ty
);
2268 fiz0
= _mm_add_ps(fiz0
,tz
);
2270 fjx2
= _mm_add_ps(fjx2
,tx
);
2271 fjy2
= _mm_add_ps(fjy2
,ty
);
2272 fjz2
= _mm_add_ps(fjz2
,tz
);
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2280 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2283 r10
= _mm_mul_ps(rsq10
,rinv10
);
2284 r10
= _mm_andnot_ps(dummy_mask
,r10
);
2286 /* EWALD ELECTROSTATICS */
2288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2289 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
2290 ewitab
= _mm_cvttps_epi32(ewrt
);
2291 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2292 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2293 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2295 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2296 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
2298 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
2302 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2304 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2306 /* Calculate temporary vectorial force */
2307 tx
= _mm_mul_ps(fscal
,dx10
);
2308 ty
= _mm_mul_ps(fscal
,dy10
);
2309 tz
= _mm_mul_ps(fscal
,dz10
);
2311 /* Update vectorial force */
2312 fix1
= _mm_add_ps(fix1
,tx
);
2313 fiy1
= _mm_add_ps(fiy1
,ty
);
2314 fiz1
= _mm_add_ps(fiz1
,tz
);
2316 fjx0
= _mm_add_ps(fjx0
,tx
);
2317 fjy0
= _mm_add_ps(fjy0
,ty
);
2318 fjz0
= _mm_add_ps(fjz0
,tz
);
2322 /**************************
2323 * CALCULATE INTERACTIONS *
2324 **************************/
2326 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2329 r11
= _mm_mul_ps(rsq11
,rinv11
);
2330 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2332 /* EWALD ELECTROSTATICS */
2334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
2336 ewitab
= _mm_cvttps_epi32(ewrt
);
2337 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2338 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2339 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2341 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2342 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
2344 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2348 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2350 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2352 /* Calculate temporary vectorial force */
2353 tx
= _mm_mul_ps(fscal
,dx11
);
2354 ty
= _mm_mul_ps(fscal
,dy11
);
2355 tz
= _mm_mul_ps(fscal
,dz11
);
2357 /* Update vectorial force */
2358 fix1
= _mm_add_ps(fix1
,tx
);
2359 fiy1
= _mm_add_ps(fiy1
,ty
);
2360 fiz1
= _mm_add_ps(fiz1
,tz
);
2362 fjx1
= _mm_add_ps(fjx1
,tx
);
2363 fjy1
= _mm_add_ps(fjy1
,ty
);
2364 fjz1
= _mm_add_ps(fjz1
,tz
);
2368 /**************************
2369 * CALCULATE INTERACTIONS *
2370 **************************/
2372 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2375 r12
= _mm_mul_ps(rsq12
,rinv12
);
2376 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2378 /* EWALD ELECTROSTATICS */
2380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2381 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
2382 ewitab
= _mm_cvttps_epi32(ewrt
);
2383 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2384 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2385 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2387 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2388 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
2390 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2394 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2396 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2398 /* Calculate temporary vectorial force */
2399 tx
= _mm_mul_ps(fscal
,dx12
);
2400 ty
= _mm_mul_ps(fscal
,dy12
);
2401 tz
= _mm_mul_ps(fscal
,dz12
);
2403 /* Update vectorial force */
2404 fix1
= _mm_add_ps(fix1
,tx
);
2405 fiy1
= _mm_add_ps(fiy1
,ty
);
2406 fiz1
= _mm_add_ps(fiz1
,tz
);
2408 fjx2
= _mm_add_ps(fjx2
,tx
);
2409 fjy2
= _mm_add_ps(fjy2
,ty
);
2410 fjz2
= _mm_add_ps(fjz2
,tz
);
2414 /**************************
2415 * CALCULATE INTERACTIONS *
2416 **************************/
2418 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2421 r20
= _mm_mul_ps(rsq20
,rinv20
);
2422 r20
= _mm_andnot_ps(dummy_mask
,r20
);
2424 /* EWALD ELECTROSTATICS */
2426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2427 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
2428 ewitab
= _mm_cvttps_epi32(ewrt
);
2429 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2430 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2431 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2433 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2434 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
2436 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
2440 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2442 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2444 /* Calculate temporary vectorial force */
2445 tx
= _mm_mul_ps(fscal
,dx20
);
2446 ty
= _mm_mul_ps(fscal
,dy20
);
2447 tz
= _mm_mul_ps(fscal
,dz20
);
2449 /* Update vectorial force */
2450 fix2
= _mm_add_ps(fix2
,tx
);
2451 fiy2
= _mm_add_ps(fiy2
,ty
);
2452 fiz2
= _mm_add_ps(fiz2
,tz
);
2454 fjx0
= _mm_add_ps(fjx0
,tx
);
2455 fjy0
= _mm_add_ps(fjy0
,ty
);
2456 fjz0
= _mm_add_ps(fjz0
,tz
);
2460 /**************************
2461 * CALCULATE INTERACTIONS *
2462 **************************/
2464 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2467 r21
= _mm_mul_ps(rsq21
,rinv21
);
2468 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2470 /* EWALD ELECTROSTATICS */
2472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2473 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
2474 ewitab
= _mm_cvttps_epi32(ewrt
);
2475 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2476 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2477 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2479 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2480 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
2482 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
2486 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2488 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2490 /* Calculate temporary vectorial force */
2491 tx
= _mm_mul_ps(fscal
,dx21
);
2492 ty
= _mm_mul_ps(fscal
,dy21
);
2493 tz
= _mm_mul_ps(fscal
,dz21
);
2495 /* Update vectorial force */
2496 fix2
= _mm_add_ps(fix2
,tx
);
2497 fiy2
= _mm_add_ps(fiy2
,ty
);
2498 fiz2
= _mm_add_ps(fiz2
,tz
);
2500 fjx1
= _mm_add_ps(fjx1
,tx
);
2501 fjy1
= _mm_add_ps(fjy1
,ty
);
2502 fjz1
= _mm_add_ps(fjz1
,tz
);
2506 /**************************
2507 * CALCULATE INTERACTIONS *
2508 **************************/
2510 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2513 r22
= _mm_mul_ps(rsq22
,rinv22
);
2514 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2516 /* EWALD ELECTROSTATICS */
2518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2519 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2520 ewitab
= _mm_cvttps_epi32(ewrt
);
2521 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2522 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2523 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
2525 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
2526 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2528 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2532 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2534 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2536 /* Calculate temporary vectorial force */
2537 tx
= _mm_mul_ps(fscal
,dx22
);
2538 ty
= _mm_mul_ps(fscal
,dy22
);
2539 tz
= _mm_mul_ps(fscal
,dz22
);
2541 /* Update vectorial force */
2542 fix2
= _mm_add_ps(fix2
,tx
);
2543 fiy2
= _mm_add_ps(fiy2
,ty
);
2544 fiz2
= _mm_add_ps(fiz2
,tz
);
2546 fjx2
= _mm_add_ps(fjx2
,tx
);
2547 fjy2
= _mm_add_ps(fjy2
,ty
);
2548 fjz2
= _mm_add_ps(fjz2
,tz
);
2552 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2553 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2554 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2555 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2557 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2558 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2560 /* Inner loop uses 360 flops */
2563 /* End of innermost loop */
2565 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2566 f
+i_coord_offset
,fshift
+i_shift_offset
);
2568 /* Increment number of inner iterations */
2569 inneriter
+= j_index_end
- j_index_start
;
2571 /* Outer loop uses 18 flops */
2574 /* Increment number of outer iterations */
2577 /* Update outer/inner flops */
2579 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3W3_F
,outeriter
*18 + inneriter
*360);