2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
91 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
92 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
93 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
96 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
98 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
99 real rswitch_scalar
,d_scalar
;
100 __m128 dummy_mask
,cutoff_mask
;
101 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
102 __m128 one
= _mm_set1_ps(1.0);
103 __m128 two
= _mm_set1_ps(2.0);
109 jindex
= nlist
->jindex
;
111 shiftidx
= nlist
->shift
;
113 shiftvec
= fr
->shift_vec
[0];
114 fshift
= fr
->fshift
[0];
115 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
116 charge
= mdatoms
->chargeA
;
118 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
119 ewtab
= fr
->ic
->tabq_coul_FDV0
;
120 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
121 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
123 /* Setup water-specific parameters */
124 inr
= nlist
->iinr
[0];
125 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
126 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
127 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
129 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130 rcutoff_scalar
= fr
->ic
->rcoulomb
;
131 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
132 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
134 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
135 rswitch
= _mm_set1_ps(rswitch_scalar
);
136 /* Setup switch parameters */
137 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
138 d
= _mm_set1_ps(d_scalar
);
139 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
140 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
141 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
142 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
143 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
144 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
146 /* Avoid stupid compiler warnings */
147 jnrA
= jnrB
= jnrC
= jnrD
= 0;
156 for(iidx
=0;iidx
<4*DIM
;iidx
++)
161 /* Start outer loop over neighborlists */
162 for(iidx
=0; iidx
<nri
; iidx
++)
164 /* Load shift vector for this list */
165 i_shift_offset
= DIM
*shiftidx
[iidx
];
167 /* Load limits for loop over neighbors */
168 j_index_start
= jindex
[iidx
];
169 j_index_end
= jindex
[iidx
+1];
171 /* Get outer coordinate index */
173 i_coord_offset
= DIM
*inr
;
175 /* Load i particle coords and add shift vector */
176 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
177 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
179 fix0
= _mm_setzero_ps();
180 fiy0
= _mm_setzero_ps();
181 fiz0
= _mm_setzero_ps();
182 fix1
= _mm_setzero_ps();
183 fiy1
= _mm_setzero_ps();
184 fiz1
= _mm_setzero_ps();
185 fix2
= _mm_setzero_ps();
186 fiy2
= _mm_setzero_ps();
187 fiz2
= _mm_setzero_ps();
189 /* Reset potential sums */
190 velecsum
= _mm_setzero_ps();
192 /* Start inner kernel loop */
193 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
196 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA
= DIM
*jnrA
;
202 j_coord_offsetB
= DIM
*jnrB
;
203 j_coord_offsetC
= DIM
*jnrC
;
204 j_coord_offsetD
= DIM
*jnrD
;
206 /* load j atom coordinates */
207 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
208 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
211 /* Calculate displacement vector */
212 dx00
= _mm_sub_ps(ix0
,jx0
);
213 dy00
= _mm_sub_ps(iy0
,jy0
);
214 dz00
= _mm_sub_ps(iz0
,jz0
);
215 dx10
= _mm_sub_ps(ix1
,jx0
);
216 dy10
= _mm_sub_ps(iy1
,jy0
);
217 dz10
= _mm_sub_ps(iz1
,jz0
);
218 dx20
= _mm_sub_ps(ix2
,jx0
);
219 dy20
= _mm_sub_ps(iy2
,jy0
);
220 dz20
= _mm_sub_ps(iz2
,jz0
);
222 /* Calculate squared distance and things based on it */
223 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
224 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
225 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
227 rinv00
= sse41_invsqrt_f(rsq00
);
228 rinv10
= sse41_invsqrt_f(rsq10
);
229 rinv20
= sse41_invsqrt_f(rsq20
);
231 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
232 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
233 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
235 /* Load parameters for j particles */
236 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
237 charge
+jnrC
+0,charge
+jnrD
+0);
239 fjx0
= _mm_setzero_ps();
240 fjy0
= _mm_setzero_ps();
241 fjz0
= _mm_setzero_ps();
243 /**************************
244 * CALCULATE INTERACTIONS *
245 **************************/
247 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
250 r00
= _mm_mul_ps(rsq00
,rinv00
);
252 /* Compute parameters for interactions between i and j atoms */
253 qq00
= _mm_mul_ps(iq0
,jq0
);
255 /* EWALD ELECTROSTATICS */
257 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
258 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
259 ewitab
= _mm_cvttps_epi32(ewrt
);
260 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
261 ewitab
= _mm_slli_epi32(ewitab
,2);
262 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
263 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
264 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
265 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
266 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
267 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
268 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
269 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
270 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
272 d
= _mm_sub_ps(r00
,rswitch
);
273 d
= _mm_max_ps(d
,_mm_setzero_ps());
274 d2
= _mm_mul_ps(d
,d
);
275 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
277 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
279 /* Evaluate switch function */
280 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
281 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
282 velec
= _mm_mul_ps(velec
,sw
);
283 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
285 /* Update potential sum for this i atom from the interaction with this j atom. */
286 velec
= _mm_and_ps(velec
,cutoff_mask
);
287 velecsum
= _mm_add_ps(velecsum
,velec
);
291 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
293 /* Calculate temporary vectorial force */
294 tx
= _mm_mul_ps(fscal
,dx00
);
295 ty
= _mm_mul_ps(fscal
,dy00
);
296 tz
= _mm_mul_ps(fscal
,dz00
);
298 /* Update vectorial force */
299 fix0
= _mm_add_ps(fix0
,tx
);
300 fiy0
= _mm_add_ps(fiy0
,ty
);
301 fiz0
= _mm_add_ps(fiz0
,tz
);
303 fjx0
= _mm_add_ps(fjx0
,tx
);
304 fjy0
= _mm_add_ps(fjy0
,ty
);
305 fjz0
= _mm_add_ps(fjz0
,tz
);
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
316 r10
= _mm_mul_ps(rsq10
,rinv10
);
318 /* Compute parameters for interactions between i and j atoms */
319 qq10
= _mm_mul_ps(iq1
,jq0
);
321 /* EWALD ELECTROSTATICS */
323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
324 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
325 ewitab
= _mm_cvttps_epi32(ewrt
);
326 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
327 ewitab
= _mm_slli_epi32(ewitab
,2);
328 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
329 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
330 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
331 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
332 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
333 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
334 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
335 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
336 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
338 d
= _mm_sub_ps(r10
,rswitch
);
339 d
= _mm_max_ps(d
,_mm_setzero_ps());
340 d2
= _mm_mul_ps(d
,d
);
341 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
343 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
348 velec
= _mm_mul_ps(velec
,sw
);
349 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velec
= _mm_and_ps(velec
,cutoff_mask
);
353 velecsum
= _mm_add_ps(velecsum
,velec
);
357 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
359 /* Calculate temporary vectorial force */
360 tx
= _mm_mul_ps(fscal
,dx10
);
361 ty
= _mm_mul_ps(fscal
,dy10
);
362 tz
= _mm_mul_ps(fscal
,dz10
);
364 /* Update vectorial force */
365 fix1
= _mm_add_ps(fix1
,tx
);
366 fiy1
= _mm_add_ps(fiy1
,ty
);
367 fiz1
= _mm_add_ps(fiz1
,tz
);
369 fjx0
= _mm_add_ps(fjx0
,tx
);
370 fjy0
= _mm_add_ps(fjy0
,ty
);
371 fjz0
= _mm_add_ps(fjz0
,tz
);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
382 r20
= _mm_mul_ps(rsq20
,rinv20
);
384 /* Compute parameters for interactions between i and j atoms */
385 qq20
= _mm_mul_ps(iq2
,jq0
);
387 /* EWALD ELECTROSTATICS */
389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
391 ewitab
= _mm_cvttps_epi32(ewrt
);
392 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
393 ewitab
= _mm_slli_epi32(ewitab
,2);
394 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
395 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
396 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
397 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
398 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
399 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
400 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
401 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
402 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
404 d
= _mm_sub_ps(r20
,rswitch
);
405 d
= _mm_max_ps(d
,_mm_setzero_ps());
406 d2
= _mm_mul_ps(d
,d
);
407 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
409 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
411 /* Evaluate switch function */
412 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
413 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
414 velec
= _mm_mul_ps(velec
,sw
);
415 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec
= _mm_and_ps(velec
,cutoff_mask
);
419 velecsum
= _mm_add_ps(velecsum
,velec
);
423 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
425 /* Calculate temporary vectorial force */
426 tx
= _mm_mul_ps(fscal
,dx20
);
427 ty
= _mm_mul_ps(fscal
,dy20
);
428 tz
= _mm_mul_ps(fscal
,dz20
);
430 /* Update vectorial force */
431 fix2
= _mm_add_ps(fix2
,tx
);
432 fiy2
= _mm_add_ps(fiy2
,ty
);
433 fiz2
= _mm_add_ps(fiz2
,tz
);
435 fjx0
= _mm_add_ps(fjx0
,tx
);
436 fjy0
= _mm_add_ps(fjy0
,ty
);
437 fjz0
= _mm_add_ps(fjz0
,tz
);
441 fjptrA
= f
+j_coord_offsetA
;
442 fjptrB
= f
+j_coord_offsetB
;
443 fjptrC
= f
+j_coord_offsetC
;
444 fjptrD
= f
+j_coord_offsetD
;
446 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
448 /* Inner loop uses 195 flops */
454 /* Get j neighbor index, and coordinate index */
455 jnrlistA
= jjnr
[jidx
];
456 jnrlistB
= jjnr
[jidx
+1];
457 jnrlistC
= jjnr
[jidx
+2];
458 jnrlistD
= jjnr
[jidx
+3];
459 /* Sign of each element will be negative for non-real atoms.
460 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
461 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
463 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
464 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
465 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
466 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
467 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
468 j_coord_offsetA
= DIM
*jnrA
;
469 j_coord_offsetB
= DIM
*jnrB
;
470 j_coord_offsetC
= DIM
*jnrC
;
471 j_coord_offsetD
= DIM
*jnrD
;
473 /* load j atom coordinates */
474 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
475 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
478 /* Calculate displacement vector */
479 dx00
= _mm_sub_ps(ix0
,jx0
);
480 dy00
= _mm_sub_ps(iy0
,jy0
);
481 dz00
= _mm_sub_ps(iz0
,jz0
);
482 dx10
= _mm_sub_ps(ix1
,jx0
);
483 dy10
= _mm_sub_ps(iy1
,jy0
);
484 dz10
= _mm_sub_ps(iz1
,jz0
);
485 dx20
= _mm_sub_ps(ix2
,jx0
);
486 dy20
= _mm_sub_ps(iy2
,jy0
);
487 dz20
= _mm_sub_ps(iz2
,jz0
);
489 /* Calculate squared distance and things based on it */
490 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
491 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
492 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
494 rinv00
= sse41_invsqrt_f(rsq00
);
495 rinv10
= sse41_invsqrt_f(rsq10
);
496 rinv20
= sse41_invsqrt_f(rsq20
);
498 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
499 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
500 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
502 /* Load parameters for j particles */
503 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
504 charge
+jnrC
+0,charge
+jnrD
+0);
506 fjx0
= _mm_setzero_ps();
507 fjy0
= _mm_setzero_ps();
508 fjz0
= _mm_setzero_ps();
510 /**************************
511 * CALCULATE INTERACTIONS *
512 **************************/
514 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
517 r00
= _mm_mul_ps(rsq00
,rinv00
);
518 r00
= _mm_andnot_ps(dummy_mask
,r00
);
520 /* Compute parameters for interactions between i and j atoms */
521 qq00
= _mm_mul_ps(iq0
,jq0
);
523 /* EWALD ELECTROSTATICS */
525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
526 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
527 ewitab
= _mm_cvttps_epi32(ewrt
);
528 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
529 ewitab
= _mm_slli_epi32(ewitab
,2);
530 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
531 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
532 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
533 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
534 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
535 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
536 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
537 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
538 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
540 d
= _mm_sub_ps(r00
,rswitch
);
541 d
= _mm_max_ps(d
,_mm_setzero_ps());
542 d2
= _mm_mul_ps(d
,d
);
543 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
545 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
547 /* Evaluate switch function */
548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
549 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
550 velec
= _mm_mul_ps(velec
,sw
);
551 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
553 /* Update potential sum for this i atom from the interaction with this j atom. */
554 velec
= _mm_and_ps(velec
,cutoff_mask
);
555 velec
= _mm_andnot_ps(dummy_mask
,velec
);
556 velecsum
= _mm_add_ps(velecsum
,velec
);
560 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
562 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
564 /* Calculate temporary vectorial force */
565 tx
= _mm_mul_ps(fscal
,dx00
);
566 ty
= _mm_mul_ps(fscal
,dy00
);
567 tz
= _mm_mul_ps(fscal
,dz00
);
569 /* Update vectorial force */
570 fix0
= _mm_add_ps(fix0
,tx
);
571 fiy0
= _mm_add_ps(fiy0
,ty
);
572 fiz0
= _mm_add_ps(fiz0
,tz
);
574 fjx0
= _mm_add_ps(fjx0
,tx
);
575 fjy0
= _mm_add_ps(fjy0
,ty
);
576 fjz0
= _mm_add_ps(fjz0
,tz
);
580 /**************************
581 * CALCULATE INTERACTIONS *
582 **************************/
584 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
587 r10
= _mm_mul_ps(rsq10
,rinv10
);
588 r10
= _mm_andnot_ps(dummy_mask
,r10
);
590 /* Compute parameters for interactions between i and j atoms */
591 qq10
= _mm_mul_ps(iq1
,jq0
);
593 /* EWALD ELECTROSTATICS */
595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
597 ewitab
= _mm_cvttps_epi32(ewrt
);
598 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
599 ewitab
= _mm_slli_epi32(ewitab
,2);
600 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
601 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
602 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
603 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
604 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
605 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
606 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
607 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
608 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
610 d
= _mm_sub_ps(r10
,rswitch
);
611 d
= _mm_max_ps(d
,_mm_setzero_ps());
612 d2
= _mm_mul_ps(d
,d
);
613 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
615 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
617 /* Evaluate switch function */
618 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
619 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
620 velec
= _mm_mul_ps(velec
,sw
);
621 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
623 /* Update potential sum for this i atom from the interaction with this j atom. */
624 velec
= _mm_and_ps(velec
,cutoff_mask
);
625 velec
= _mm_andnot_ps(dummy_mask
,velec
);
626 velecsum
= _mm_add_ps(velecsum
,velec
);
630 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
632 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
634 /* Calculate temporary vectorial force */
635 tx
= _mm_mul_ps(fscal
,dx10
);
636 ty
= _mm_mul_ps(fscal
,dy10
);
637 tz
= _mm_mul_ps(fscal
,dz10
);
639 /* Update vectorial force */
640 fix1
= _mm_add_ps(fix1
,tx
);
641 fiy1
= _mm_add_ps(fiy1
,ty
);
642 fiz1
= _mm_add_ps(fiz1
,tz
);
644 fjx0
= _mm_add_ps(fjx0
,tx
);
645 fjy0
= _mm_add_ps(fjy0
,ty
);
646 fjz0
= _mm_add_ps(fjz0
,tz
);
650 /**************************
651 * CALCULATE INTERACTIONS *
652 **************************/
654 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
657 r20
= _mm_mul_ps(rsq20
,rinv20
);
658 r20
= _mm_andnot_ps(dummy_mask
,r20
);
660 /* Compute parameters for interactions between i and j atoms */
661 qq20
= _mm_mul_ps(iq2
,jq0
);
663 /* EWALD ELECTROSTATICS */
665 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
666 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
667 ewitab
= _mm_cvttps_epi32(ewrt
);
668 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
669 ewitab
= _mm_slli_epi32(ewitab
,2);
670 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
671 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
672 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
673 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
674 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
675 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
676 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
677 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
678 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
680 d
= _mm_sub_ps(r20
,rswitch
);
681 d
= _mm_max_ps(d
,_mm_setzero_ps());
682 d2
= _mm_mul_ps(d
,d
);
683 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
685 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
687 /* Evaluate switch function */
688 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
689 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
690 velec
= _mm_mul_ps(velec
,sw
);
691 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec
= _mm_and_ps(velec
,cutoff_mask
);
695 velec
= _mm_andnot_ps(dummy_mask
,velec
);
696 velecsum
= _mm_add_ps(velecsum
,velec
);
700 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
702 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
704 /* Calculate temporary vectorial force */
705 tx
= _mm_mul_ps(fscal
,dx20
);
706 ty
= _mm_mul_ps(fscal
,dy20
);
707 tz
= _mm_mul_ps(fscal
,dz20
);
709 /* Update vectorial force */
710 fix2
= _mm_add_ps(fix2
,tx
);
711 fiy2
= _mm_add_ps(fiy2
,ty
);
712 fiz2
= _mm_add_ps(fiz2
,tz
);
714 fjx0
= _mm_add_ps(fjx0
,tx
);
715 fjy0
= _mm_add_ps(fjy0
,ty
);
716 fjz0
= _mm_add_ps(fjz0
,tz
);
720 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
721 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
722 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
723 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
725 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
727 /* Inner loop uses 198 flops */
730 /* End of innermost loop */
732 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
733 f
+i_coord_offset
,fshift
+i_shift_offset
);
736 /* Update potential energies */
737 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
739 /* Increment number of inner iterations */
740 inneriter
+= j_index_end
- j_index_start
;
742 /* Outer loop uses 19 flops */
745 /* Increment number of outer iterations */
748 /* Update outer/inner flops */
750 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*198);
753 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
754 * Electrostatics interaction: Ewald
755 * VdW interaction: None
756 * Geometry: Water3-Particle
757 * Calculate force/pot: Force
760 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
761 (t_nblist
* gmx_restrict nlist
,
762 rvec
* gmx_restrict xx
,
763 rvec
* gmx_restrict ff
,
764 struct t_forcerec
* gmx_restrict fr
,
765 t_mdatoms
* gmx_restrict mdatoms
,
766 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
767 t_nrnb
* gmx_restrict nrnb
)
769 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
770 * just 0 for non-waters.
771 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
772 * jnr indices corresponding to data put in the four positions in the SIMD register.
774 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
775 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
776 int jnrA
,jnrB
,jnrC
,jnrD
;
777 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
778 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
779 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
781 real
*shiftvec
,*fshift
,*x
,*f
;
782 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
784 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
786 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
788 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
790 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
791 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
792 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
793 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
794 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
795 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
796 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
799 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
801 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
802 real rswitch_scalar
,d_scalar
;
803 __m128 dummy_mask
,cutoff_mask
;
804 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
805 __m128 one
= _mm_set1_ps(1.0);
806 __m128 two
= _mm_set1_ps(2.0);
812 jindex
= nlist
->jindex
;
814 shiftidx
= nlist
->shift
;
816 shiftvec
= fr
->shift_vec
[0];
817 fshift
= fr
->fshift
[0];
818 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
819 charge
= mdatoms
->chargeA
;
821 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
822 ewtab
= fr
->ic
->tabq_coul_FDV0
;
823 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
824 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
826 /* Setup water-specific parameters */
827 inr
= nlist
->iinr
[0];
828 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
829 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
830 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
832 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
833 rcutoff_scalar
= fr
->ic
->rcoulomb
;
834 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
835 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
837 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
838 rswitch
= _mm_set1_ps(rswitch_scalar
);
839 /* Setup switch parameters */
840 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
841 d
= _mm_set1_ps(d_scalar
);
842 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
843 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
844 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
845 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
846 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
847 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
849 /* Avoid stupid compiler warnings */
850 jnrA
= jnrB
= jnrC
= jnrD
= 0;
859 for(iidx
=0;iidx
<4*DIM
;iidx
++)
864 /* Start outer loop over neighborlists */
865 for(iidx
=0; iidx
<nri
; iidx
++)
867 /* Load shift vector for this list */
868 i_shift_offset
= DIM
*shiftidx
[iidx
];
870 /* Load limits for loop over neighbors */
871 j_index_start
= jindex
[iidx
];
872 j_index_end
= jindex
[iidx
+1];
874 /* Get outer coordinate index */
876 i_coord_offset
= DIM
*inr
;
878 /* Load i particle coords and add shift vector */
879 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
880 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
882 fix0
= _mm_setzero_ps();
883 fiy0
= _mm_setzero_ps();
884 fiz0
= _mm_setzero_ps();
885 fix1
= _mm_setzero_ps();
886 fiy1
= _mm_setzero_ps();
887 fiz1
= _mm_setzero_ps();
888 fix2
= _mm_setzero_ps();
889 fiy2
= _mm_setzero_ps();
890 fiz2
= _mm_setzero_ps();
892 /* Start inner kernel loop */
893 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
896 /* Get j neighbor index, and coordinate index */
901 j_coord_offsetA
= DIM
*jnrA
;
902 j_coord_offsetB
= DIM
*jnrB
;
903 j_coord_offsetC
= DIM
*jnrC
;
904 j_coord_offsetD
= DIM
*jnrD
;
906 /* load j atom coordinates */
907 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
908 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
911 /* Calculate displacement vector */
912 dx00
= _mm_sub_ps(ix0
,jx0
);
913 dy00
= _mm_sub_ps(iy0
,jy0
);
914 dz00
= _mm_sub_ps(iz0
,jz0
);
915 dx10
= _mm_sub_ps(ix1
,jx0
);
916 dy10
= _mm_sub_ps(iy1
,jy0
);
917 dz10
= _mm_sub_ps(iz1
,jz0
);
918 dx20
= _mm_sub_ps(ix2
,jx0
);
919 dy20
= _mm_sub_ps(iy2
,jy0
);
920 dz20
= _mm_sub_ps(iz2
,jz0
);
922 /* Calculate squared distance and things based on it */
923 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
924 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
925 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
927 rinv00
= sse41_invsqrt_f(rsq00
);
928 rinv10
= sse41_invsqrt_f(rsq10
);
929 rinv20
= sse41_invsqrt_f(rsq20
);
931 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
932 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
933 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
935 /* Load parameters for j particles */
936 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
937 charge
+jnrC
+0,charge
+jnrD
+0);
939 fjx0
= _mm_setzero_ps();
940 fjy0
= _mm_setzero_ps();
941 fjz0
= _mm_setzero_ps();
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
950 r00
= _mm_mul_ps(rsq00
,rinv00
);
952 /* Compute parameters for interactions between i and j atoms */
953 qq00
= _mm_mul_ps(iq0
,jq0
);
955 /* EWALD ELECTROSTATICS */
957 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
958 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
959 ewitab
= _mm_cvttps_epi32(ewrt
);
960 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
961 ewitab
= _mm_slli_epi32(ewitab
,2);
962 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
963 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
964 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
965 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
966 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
967 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
968 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
969 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
970 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
972 d
= _mm_sub_ps(r00
,rswitch
);
973 d
= _mm_max_ps(d
,_mm_setzero_ps());
974 d2
= _mm_mul_ps(d
,d
);
975 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
977 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
979 /* Evaluate switch function */
980 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
981 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
982 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
986 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
988 /* Calculate temporary vectorial force */
989 tx
= _mm_mul_ps(fscal
,dx00
);
990 ty
= _mm_mul_ps(fscal
,dy00
);
991 tz
= _mm_mul_ps(fscal
,dz00
);
993 /* Update vectorial force */
994 fix0
= _mm_add_ps(fix0
,tx
);
995 fiy0
= _mm_add_ps(fiy0
,ty
);
996 fiz0
= _mm_add_ps(fiz0
,tz
);
998 fjx0
= _mm_add_ps(fjx0
,tx
);
999 fjy0
= _mm_add_ps(fjy0
,ty
);
1000 fjz0
= _mm_add_ps(fjz0
,tz
);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1011 r10
= _mm_mul_ps(rsq10
,rinv10
);
1013 /* Compute parameters for interactions between i and j atoms */
1014 qq10
= _mm_mul_ps(iq1
,jq0
);
1016 /* EWALD ELECTROSTATICS */
1018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1020 ewitab
= _mm_cvttps_epi32(ewrt
);
1021 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
1022 ewitab
= _mm_slli_epi32(ewitab
,2);
1023 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1024 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1025 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1026 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1027 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1028 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1029 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1030 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
1031 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1033 d
= _mm_sub_ps(r10
,rswitch
);
1034 d
= _mm_max_ps(d
,_mm_setzero_ps());
1035 d2
= _mm_mul_ps(d
,d
);
1036 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1038 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1040 /* Evaluate switch function */
1041 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1042 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
1043 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1047 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1049 /* Calculate temporary vectorial force */
1050 tx
= _mm_mul_ps(fscal
,dx10
);
1051 ty
= _mm_mul_ps(fscal
,dy10
);
1052 tz
= _mm_mul_ps(fscal
,dz10
);
1054 /* Update vectorial force */
1055 fix1
= _mm_add_ps(fix1
,tx
);
1056 fiy1
= _mm_add_ps(fiy1
,ty
);
1057 fiz1
= _mm_add_ps(fiz1
,tz
);
1059 fjx0
= _mm_add_ps(fjx0
,tx
);
1060 fjy0
= _mm_add_ps(fjy0
,ty
);
1061 fjz0
= _mm_add_ps(fjz0
,tz
);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1072 r20
= _mm_mul_ps(rsq20
,rinv20
);
1074 /* Compute parameters for interactions between i and j atoms */
1075 qq20
= _mm_mul_ps(iq2
,jq0
);
1077 /* EWALD ELECTROSTATICS */
1079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1080 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1081 ewitab
= _mm_cvttps_epi32(ewrt
);
1082 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
1083 ewitab
= _mm_slli_epi32(ewitab
,2);
1084 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1085 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1086 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1087 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1088 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1089 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1090 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1091 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
1092 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1094 d
= _mm_sub_ps(r20
,rswitch
);
1095 d
= _mm_max_ps(d
,_mm_setzero_ps());
1096 d2
= _mm_mul_ps(d
,d
);
1097 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1099 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1101 /* Evaluate switch function */
1102 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1103 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
1104 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1108 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1110 /* Calculate temporary vectorial force */
1111 tx
= _mm_mul_ps(fscal
,dx20
);
1112 ty
= _mm_mul_ps(fscal
,dy20
);
1113 tz
= _mm_mul_ps(fscal
,dz20
);
1115 /* Update vectorial force */
1116 fix2
= _mm_add_ps(fix2
,tx
);
1117 fiy2
= _mm_add_ps(fiy2
,ty
);
1118 fiz2
= _mm_add_ps(fiz2
,tz
);
1120 fjx0
= _mm_add_ps(fjx0
,tx
);
1121 fjy0
= _mm_add_ps(fjy0
,ty
);
1122 fjz0
= _mm_add_ps(fjz0
,tz
);
1126 fjptrA
= f
+j_coord_offsetA
;
1127 fjptrB
= f
+j_coord_offsetB
;
1128 fjptrC
= f
+j_coord_offsetC
;
1129 fjptrD
= f
+j_coord_offsetD
;
1131 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
1133 /* Inner loop uses 186 flops */
1136 if(jidx
<j_index_end
)
1139 /* Get j neighbor index, and coordinate index */
1140 jnrlistA
= jjnr
[jidx
];
1141 jnrlistB
= jjnr
[jidx
+1];
1142 jnrlistC
= jjnr
[jidx
+2];
1143 jnrlistD
= jjnr
[jidx
+3];
1144 /* Sign of each element will be negative for non-real atoms.
1145 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1146 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1148 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1149 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1150 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1151 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1152 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1153 j_coord_offsetA
= DIM
*jnrA
;
1154 j_coord_offsetB
= DIM
*jnrB
;
1155 j_coord_offsetC
= DIM
*jnrC
;
1156 j_coord_offsetD
= DIM
*jnrD
;
1158 /* load j atom coordinates */
1159 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1160 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1163 /* Calculate displacement vector */
1164 dx00
= _mm_sub_ps(ix0
,jx0
);
1165 dy00
= _mm_sub_ps(iy0
,jy0
);
1166 dz00
= _mm_sub_ps(iz0
,jz0
);
1167 dx10
= _mm_sub_ps(ix1
,jx0
);
1168 dy10
= _mm_sub_ps(iy1
,jy0
);
1169 dz10
= _mm_sub_ps(iz1
,jz0
);
1170 dx20
= _mm_sub_ps(ix2
,jx0
);
1171 dy20
= _mm_sub_ps(iy2
,jy0
);
1172 dz20
= _mm_sub_ps(iz2
,jz0
);
1174 /* Calculate squared distance and things based on it */
1175 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1176 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1177 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1179 rinv00
= sse41_invsqrt_f(rsq00
);
1180 rinv10
= sse41_invsqrt_f(rsq10
);
1181 rinv20
= sse41_invsqrt_f(rsq20
);
1183 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1184 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1185 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1187 /* Load parameters for j particles */
1188 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1189 charge
+jnrC
+0,charge
+jnrD
+0);
1191 fjx0
= _mm_setzero_ps();
1192 fjy0
= _mm_setzero_ps();
1193 fjz0
= _mm_setzero_ps();
1195 /**************************
1196 * CALCULATE INTERACTIONS *
1197 **************************/
1199 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1202 r00
= _mm_mul_ps(rsq00
,rinv00
);
1203 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1205 /* Compute parameters for interactions between i and j atoms */
1206 qq00
= _mm_mul_ps(iq0
,jq0
);
1208 /* EWALD ELECTROSTATICS */
1210 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1212 ewitab
= _mm_cvttps_epi32(ewrt
);
1213 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
1214 ewitab
= _mm_slli_epi32(ewitab
,2);
1215 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1216 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1217 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1218 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1219 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1220 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1221 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1222 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
1223 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1225 d
= _mm_sub_ps(r00
,rswitch
);
1226 d
= _mm_max_ps(d
,_mm_setzero_ps());
1227 d2
= _mm_mul_ps(d
,d
);
1228 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1230 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1232 /* Evaluate switch function */
1233 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1234 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
1235 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1239 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1241 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1243 /* Calculate temporary vectorial force */
1244 tx
= _mm_mul_ps(fscal
,dx00
);
1245 ty
= _mm_mul_ps(fscal
,dy00
);
1246 tz
= _mm_mul_ps(fscal
,dz00
);
1248 /* Update vectorial force */
1249 fix0
= _mm_add_ps(fix0
,tx
);
1250 fiy0
= _mm_add_ps(fiy0
,ty
);
1251 fiz0
= _mm_add_ps(fiz0
,tz
);
1253 fjx0
= _mm_add_ps(fjx0
,tx
);
1254 fjy0
= _mm_add_ps(fjy0
,ty
);
1255 fjz0
= _mm_add_ps(fjz0
,tz
);
1259 /**************************
1260 * CALCULATE INTERACTIONS *
1261 **************************/
1263 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1266 r10
= _mm_mul_ps(rsq10
,rinv10
);
1267 r10
= _mm_andnot_ps(dummy_mask
,r10
);
1269 /* Compute parameters for interactions between i and j atoms */
1270 qq10
= _mm_mul_ps(iq1
,jq0
);
1272 /* EWALD ELECTROSTATICS */
1274 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1275 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1276 ewitab
= _mm_cvttps_epi32(ewrt
);
1277 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
1278 ewitab
= _mm_slli_epi32(ewitab
,2);
1279 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1280 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1281 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1282 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1283 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1284 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1285 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1286 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
1287 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1289 d
= _mm_sub_ps(r10
,rswitch
);
1290 d
= _mm_max_ps(d
,_mm_setzero_ps());
1291 d2
= _mm_mul_ps(d
,d
);
1292 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1294 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1296 /* Evaluate switch function */
1297 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1298 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
1299 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1303 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1305 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1307 /* Calculate temporary vectorial force */
1308 tx
= _mm_mul_ps(fscal
,dx10
);
1309 ty
= _mm_mul_ps(fscal
,dy10
);
1310 tz
= _mm_mul_ps(fscal
,dz10
);
1312 /* Update vectorial force */
1313 fix1
= _mm_add_ps(fix1
,tx
);
1314 fiy1
= _mm_add_ps(fiy1
,ty
);
1315 fiz1
= _mm_add_ps(fiz1
,tz
);
1317 fjx0
= _mm_add_ps(fjx0
,tx
);
1318 fjy0
= _mm_add_ps(fjy0
,ty
);
1319 fjz0
= _mm_add_ps(fjz0
,tz
);
1323 /**************************
1324 * CALCULATE INTERACTIONS *
1325 **************************/
1327 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1330 r20
= _mm_mul_ps(rsq20
,rinv20
);
1331 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1333 /* Compute parameters for interactions between i and j atoms */
1334 qq20
= _mm_mul_ps(iq2
,jq0
);
1336 /* EWALD ELECTROSTATICS */
1338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1340 ewitab
= _mm_cvttps_epi32(ewrt
);
1341 eweps
= _mm_sub_ps(ewrt
,_mm_round_ps(ewrt
, _MM_FROUND_FLOOR
));
1342 ewitab
= _mm_slli_epi32(ewitab
,2);
1343 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1344 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1345 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1346 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1347 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1348 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1349 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1350 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
1351 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1353 d
= _mm_sub_ps(r20
,rswitch
);
1354 d
= _mm_max_ps(d
,_mm_setzero_ps());
1355 d2
= _mm_mul_ps(d
,d
);
1356 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1358 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1360 /* Evaluate switch function */
1361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
1363 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1367 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1369 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1371 /* Calculate temporary vectorial force */
1372 tx
= _mm_mul_ps(fscal
,dx20
);
1373 ty
= _mm_mul_ps(fscal
,dy20
);
1374 tz
= _mm_mul_ps(fscal
,dz20
);
1376 /* Update vectorial force */
1377 fix2
= _mm_add_ps(fix2
,tx
);
1378 fiy2
= _mm_add_ps(fiy2
,ty
);
1379 fiz2
= _mm_add_ps(fiz2
,tz
);
1381 fjx0
= _mm_add_ps(fjx0
,tx
);
1382 fjy0
= _mm_add_ps(fjy0
,ty
);
1383 fjz0
= _mm_add_ps(fjz0
,tz
);
1387 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1388 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1389 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1390 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1392 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
1394 /* Inner loop uses 189 flops */
1397 /* End of innermost loop */
1399 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1400 f
+i_coord_offset
,fshift
+i_shift_offset
);
1402 /* Increment number of inner iterations */
1403 inneriter
+= j_index_end
- j_index_start
;
1405 /* Outer loop uses 18 flops */
1408 /* Increment number of outer iterations */
1411 /* Update outer/inner flops */
1413 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*189);