2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
53 * Electrostatics interaction: Ewald
54 * VdW interaction: None
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
93 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
94 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
95 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
98 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
100 __m128 dummy_mask
,cutoff_mask
;
101 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
102 __m128 one
= _mm_set1_ps(1.0);
103 __m128 two
= _mm_set1_ps(2.0);
109 jindex
= nlist
->jindex
;
111 shiftidx
= nlist
->shift
;
113 shiftvec
= fr
->shift_vec
[0];
114 fshift
= fr
->fshift
[0];
115 facel
= _mm_set1_ps(fr
->epsfac
);
116 charge
= mdatoms
->chargeA
;
118 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
119 ewtab
= fr
->ic
->tabq_coul_FDV0
;
120 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
121 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
123 /* Setup water-specific parameters */
124 inr
= nlist
->iinr
[0];
125 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
126 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
127 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
129 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
130 rcutoff_scalar
= fr
->rcoulomb
;
131 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
132 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
134 /* Avoid stupid compiler warnings */
135 jnrA
= jnrB
= jnrC
= jnrD
= 0;
144 for(iidx
=0;iidx
<4*DIM
;iidx
++)
149 /* Start outer loop over neighborlists */
150 for(iidx
=0; iidx
<nri
; iidx
++)
152 /* Load shift vector for this list */
153 i_shift_offset
= DIM
*shiftidx
[iidx
];
155 /* Load limits for loop over neighbors */
156 j_index_start
= jindex
[iidx
];
157 j_index_end
= jindex
[iidx
+1];
159 /* Get outer coordinate index */
161 i_coord_offset
= DIM
*inr
;
163 /* Load i particle coords and add shift vector */
164 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
165 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
167 fix0
= _mm_setzero_ps();
168 fiy0
= _mm_setzero_ps();
169 fiz0
= _mm_setzero_ps();
170 fix1
= _mm_setzero_ps();
171 fiy1
= _mm_setzero_ps();
172 fiz1
= _mm_setzero_ps();
173 fix2
= _mm_setzero_ps();
174 fiy2
= _mm_setzero_ps();
175 fiz2
= _mm_setzero_ps();
177 /* Reset potential sums */
178 velecsum
= _mm_setzero_ps();
180 /* Start inner kernel loop */
181 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
184 /* Get j neighbor index, and coordinate index */
189 j_coord_offsetA
= DIM
*jnrA
;
190 j_coord_offsetB
= DIM
*jnrB
;
191 j_coord_offsetC
= DIM
*jnrC
;
192 j_coord_offsetD
= DIM
*jnrD
;
194 /* load j atom coordinates */
195 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
196 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
199 /* Calculate displacement vector */
200 dx00
= _mm_sub_ps(ix0
,jx0
);
201 dy00
= _mm_sub_ps(iy0
,jy0
);
202 dz00
= _mm_sub_ps(iz0
,jz0
);
203 dx10
= _mm_sub_ps(ix1
,jx0
);
204 dy10
= _mm_sub_ps(iy1
,jy0
);
205 dz10
= _mm_sub_ps(iz1
,jz0
);
206 dx20
= _mm_sub_ps(ix2
,jx0
);
207 dy20
= _mm_sub_ps(iy2
,jy0
);
208 dz20
= _mm_sub_ps(iz2
,jz0
);
210 /* Calculate squared distance and things based on it */
211 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
212 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
213 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
215 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
216 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
217 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
219 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
220 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
221 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
223 /* Load parameters for j particles */
224 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
225 charge
+jnrC
+0,charge
+jnrD
+0);
227 fjx0
= _mm_setzero_ps();
228 fjy0
= _mm_setzero_ps();
229 fjz0
= _mm_setzero_ps();
231 /**************************
232 * CALCULATE INTERACTIONS *
233 **************************/
235 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
238 r00
= _mm_mul_ps(rsq00
,rinv00
);
240 /* Compute parameters for interactions between i and j atoms */
241 qq00
= _mm_mul_ps(iq0
,jq0
);
243 /* EWALD ELECTROSTATICS */
245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
246 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
247 ewitab
= _mm_cvttps_epi32(ewrt
);
248 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
249 ewitab
= _mm_slli_epi32(ewitab
,2);
250 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
251 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
252 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
253 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
254 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
255 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
256 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
257 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_sub_ps(rinv00
,sh_ewald
),velec
));
258 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
260 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
262 /* Update potential sum for this i atom from the interaction with this j atom. */
263 velec
= _mm_and_ps(velec
,cutoff_mask
);
264 velecsum
= _mm_add_ps(velecsum
,velec
);
268 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
270 /* Calculate temporary vectorial force */
271 tx
= _mm_mul_ps(fscal
,dx00
);
272 ty
= _mm_mul_ps(fscal
,dy00
);
273 tz
= _mm_mul_ps(fscal
,dz00
);
275 /* Update vectorial force */
276 fix0
= _mm_add_ps(fix0
,tx
);
277 fiy0
= _mm_add_ps(fiy0
,ty
);
278 fiz0
= _mm_add_ps(fiz0
,tz
);
280 fjx0
= _mm_add_ps(fjx0
,tx
);
281 fjy0
= _mm_add_ps(fjy0
,ty
);
282 fjz0
= _mm_add_ps(fjz0
,tz
);
286 /**************************
287 * CALCULATE INTERACTIONS *
288 **************************/
290 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
293 r10
= _mm_mul_ps(rsq10
,rinv10
);
295 /* Compute parameters for interactions between i and j atoms */
296 qq10
= _mm_mul_ps(iq1
,jq0
);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
302 ewitab
= _mm_cvttps_epi32(ewrt
);
303 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
304 ewitab
= _mm_slli_epi32(ewitab
,2);
305 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
306 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
307 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
308 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
309 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
310 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
311 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
312 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_sub_ps(rinv10
,sh_ewald
),velec
));
313 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
315 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velec
= _mm_and_ps(velec
,cutoff_mask
);
319 velecsum
= _mm_add_ps(velecsum
,velec
);
323 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
325 /* Calculate temporary vectorial force */
326 tx
= _mm_mul_ps(fscal
,dx10
);
327 ty
= _mm_mul_ps(fscal
,dy10
);
328 tz
= _mm_mul_ps(fscal
,dz10
);
330 /* Update vectorial force */
331 fix1
= _mm_add_ps(fix1
,tx
);
332 fiy1
= _mm_add_ps(fiy1
,ty
);
333 fiz1
= _mm_add_ps(fiz1
,tz
);
335 fjx0
= _mm_add_ps(fjx0
,tx
);
336 fjy0
= _mm_add_ps(fjy0
,ty
);
337 fjz0
= _mm_add_ps(fjz0
,tz
);
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
348 r20
= _mm_mul_ps(rsq20
,rinv20
);
350 /* Compute parameters for interactions between i and j atoms */
351 qq20
= _mm_mul_ps(iq2
,jq0
);
353 /* EWALD ELECTROSTATICS */
355 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
356 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
357 ewitab
= _mm_cvttps_epi32(ewrt
);
358 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
359 ewitab
= _mm_slli_epi32(ewitab
,2);
360 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
361 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
362 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
363 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
364 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
365 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
366 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
367 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_sub_ps(rinv20
,sh_ewald
),velec
));
368 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
370 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velec
= _mm_and_ps(velec
,cutoff_mask
);
374 velecsum
= _mm_add_ps(velecsum
,velec
);
378 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
380 /* Calculate temporary vectorial force */
381 tx
= _mm_mul_ps(fscal
,dx20
);
382 ty
= _mm_mul_ps(fscal
,dy20
);
383 tz
= _mm_mul_ps(fscal
,dz20
);
385 /* Update vectorial force */
386 fix2
= _mm_add_ps(fix2
,tx
);
387 fiy2
= _mm_add_ps(fiy2
,ty
);
388 fiz2
= _mm_add_ps(fiz2
,tz
);
390 fjx0
= _mm_add_ps(fjx0
,tx
);
391 fjy0
= _mm_add_ps(fjy0
,ty
);
392 fjz0
= _mm_add_ps(fjz0
,tz
);
396 fjptrA
= f
+j_coord_offsetA
;
397 fjptrB
= f
+j_coord_offsetB
;
398 fjptrC
= f
+j_coord_offsetC
;
399 fjptrD
= f
+j_coord_offsetD
;
401 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
403 /* Inner loop uses 138 flops */
409 /* Get j neighbor index, and coordinate index */
410 jnrlistA
= jjnr
[jidx
];
411 jnrlistB
= jjnr
[jidx
+1];
412 jnrlistC
= jjnr
[jidx
+2];
413 jnrlistD
= jjnr
[jidx
+3];
414 /* Sign of each element will be negative for non-real atoms.
415 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
416 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
418 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
419 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
420 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
421 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
422 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
423 j_coord_offsetA
= DIM
*jnrA
;
424 j_coord_offsetB
= DIM
*jnrB
;
425 j_coord_offsetC
= DIM
*jnrC
;
426 j_coord_offsetD
= DIM
*jnrD
;
428 /* load j atom coordinates */
429 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
430 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
433 /* Calculate displacement vector */
434 dx00
= _mm_sub_ps(ix0
,jx0
);
435 dy00
= _mm_sub_ps(iy0
,jy0
);
436 dz00
= _mm_sub_ps(iz0
,jz0
);
437 dx10
= _mm_sub_ps(ix1
,jx0
);
438 dy10
= _mm_sub_ps(iy1
,jy0
);
439 dz10
= _mm_sub_ps(iz1
,jz0
);
440 dx20
= _mm_sub_ps(ix2
,jx0
);
441 dy20
= _mm_sub_ps(iy2
,jy0
);
442 dz20
= _mm_sub_ps(iz2
,jz0
);
444 /* Calculate squared distance and things based on it */
445 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
446 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
447 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
449 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
450 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
451 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
453 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
454 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
455 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
457 /* Load parameters for j particles */
458 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
459 charge
+jnrC
+0,charge
+jnrD
+0);
461 fjx0
= _mm_setzero_ps();
462 fjy0
= _mm_setzero_ps();
463 fjz0
= _mm_setzero_ps();
465 /**************************
466 * CALCULATE INTERACTIONS *
467 **************************/
469 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
472 r00
= _mm_mul_ps(rsq00
,rinv00
);
473 r00
= _mm_andnot_ps(dummy_mask
,r00
);
475 /* Compute parameters for interactions between i and j atoms */
476 qq00
= _mm_mul_ps(iq0
,jq0
);
478 /* EWALD ELECTROSTATICS */
480 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
481 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
482 ewitab
= _mm_cvttps_epi32(ewrt
);
483 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
484 ewitab
= _mm_slli_epi32(ewitab
,2);
485 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
486 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
487 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
488 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
489 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
490 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
491 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
492 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_sub_ps(rinv00
,sh_ewald
),velec
));
493 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
495 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velec
= _mm_and_ps(velec
,cutoff_mask
);
499 velec
= _mm_andnot_ps(dummy_mask
,velec
);
500 velecsum
= _mm_add_ps(velecsum
,velec
);
504 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
506 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
508 /* Calculate temporary vectorial force */
509 tx
= _mm_mul_ps(fscal
,dx00
);
510 ty
= _mm_mul_ps(fscal
,dy00
);
511 tz
= _mm_mul_ps(fscal
,dz00
);
513 /* Update vectorial force */
514 fix0
= _mm_add_ps(fix0
,tx
);
515 fiy0
= _mm_add_ps(fiy0
,ty
);
516 fiz0
= _mm_add_ps(fiz0
,tz
);
518 fjx0
= _mm_add_ps(fjx0
,tx
);
519 fjy0
= _mm_add_ps(fjy0
,ty
);
520 fjz0
= _mm_add_ps(fjz0
,tz
);
524 /**************************
525 * CALCULATE INTERACTIONS *
526 **************************/
528 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
531 r10
= _mm_mul_ps(rsq10
,rinv10
);
532 r10
= _mm_andnot_ps(dummy_mask
,r10
);
534 /* Compute parameters for interactions between i and j atoms */
535 qq10
= _mm_mul_ps(iq1
,jq0
);
537 /* EWALD ELECTROSTATICS */
539 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
540 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
541 ewitab
= _mm_cvttps_epi32(ewrt
);
542 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
543 ewitab
= _mm_slli_epi32(ewitab
,2);
544 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
545 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
546 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
547 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
548 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
549 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
550 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
551 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_sub_ps(rinv10
,sh_ewald
),velec
));
552 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
554 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 velec
= _mm_and_ps(velec
,cutoff_mask
);
558 velec
= _mm_andnot_ps(dummy_mask
,velec
);
559 velecsum
= _mm_add_ps(velecsum
,velec
);
563 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
565 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
567 /* Calculate temporary vectorial force */
568 tx
= _mm_mul_ps(fscal
,dx10
);
569 ty
= _mm_mul_ps(fscal
,dy10
);
570 tz
= _mm_mul_ps(fscal
,dz10
);
572 /* Update vectorial force */
573 fix1
= _mm_add_ps(fix1
,tx
);
574 fiy1
= _mm_add_ps(fiy1
,ty
);
575 fiz1
= _mm_add_ps(fiz1
,tz
);
577 fjx0
= _mm_add_ps(fjx0
,tx
);
578 fjy0
= _mm_add_ps(fjy0
,ty
);
579 fjz0
= _mm_add_ps(fjz0
,tz
);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
590 r20
= _mm_mul_ps(rsq20
,rinv20
);
591 r20
= _mm_andnot_ps(dummy_mask
,r20
);
593 /* Compute parameters for interactions between i and j atoms */
594 qq20
= _mm_mul_ps(iq2
,jq0
);
596 /* EWALD ELECTROSTATICS */
598 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
599 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
600 ewitab
= _mm_cvttps_epi32(ewrt
);
601 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
602 ewitab
= _mm_slli_epi32(ewitab
,2);
603 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
604 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
605 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
606 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
607 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
608 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
609 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
610 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_sub_ps(rinv20
,sh_ewald
),velec
));
611 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
613 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velec
= _mm_and_ps(velec
,cutoff_mask
);
617 velec
= _mm_andnot_ps(dummy_mask
,velec
);
618 velecsum
= _mm_add_ps(velecsum
,velec
);
622 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
624 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
626 /* Calculate temporary vectorial force */
627 tx
= _mm_mul_ps(fscal
,dx20
);
628 ty
= _mm_mul_ps(fscal
,dy20
);
629 tz
= _mm_mul_ps(fscal
,dz20
);
631 /* Update vectorial force */
632 fix2
= _mm_add_ps(fix2
,tx
);
633 fiy2
= _mm_add_ps(fiy2
,ty
);
634 fiz2
= _mm_add_ps(fiz2
,tz
);
636 fjx0
= _mm_add_ps(fjx0
,tx
);
637 fjy0
= _mm_add_ps(fjy0
,ty
);
638 fjz0
= _mm_add_ps(fjz0
,tz
);
642 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
643 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
644 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
645 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
647 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
649 /* Inner loop uses 141 flops */
652 /* End of innermost loop */
654 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
655 f
+i_coord_offset
,fshift
+i_shift_offset
);
658 /* Update potential energies */
659 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
661 /* Increment number of inner iterations */
662 inneriter
+= j_index_end
- j_index_start
;
664 /* Outer loop uses 19 flops */
667 /* Increment number of outer iterations */
670 /* Update outer/inner flops */
672 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*141);
675 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
676 * Electrostatics interaction: Ewald
677 * VdW interaction: None
678 * Geometry: Water3-Particle
679 * Calculate force/pot: Force
682 nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
683 (t_nblist
* gmx_restrict nlist
,
684 rvec
* gmx_restrict xx
,
685 rvec
* gmx_restrict ff
,
686 t_forcerec
* gmx_restrict fr
,
687 t_mdatoms
* gmx_restrict mdatoms
,
688 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
689 t_nrnb
* gmx_restrict nrnb
)
691 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
692 * just 0 for non-waters.
693 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
694 * jnr indices corresponding to data put in the four positions in the SIMD register.
696 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
697 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
698 int jnrA
,jnrB
,jnrC
,jnrD
;
699 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
700 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
701 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
703 real
*shiftvec
,*fshift
,*x
,*f
;
704 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
706 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
708 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
710 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
712 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
713 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
714 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
715 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
716 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
717 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
718 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
721 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
723 __m128 dummy_mask
,cutoff_mask
;
724 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
725 __m128 one
= _mm_set1_ps(1.0);
726 __m128 two
= _mm_set1_ps(2.0);
732 jindex
= nlist
->jindex
;
734 shiftidx
= nlist
->shift
;
736 shiftvec
= fr
->shift_vec
[0];
737 fshift
= fr
->fshift
[0];
738 facel
= _mm_set1_ps(fr
->epsfac
);
739 charge
= mdatoms
->chargeA
;
741 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
742 ewtab
= fr
->ic
->tabq_coul_F
;
743 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
744 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
746 /* Setup water-specific parameters */
747 inr
= nlist
->iinr
[0];
748 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
749 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
750 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
752 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
753 rcutoff_scalar
= fr
->rcoulomb
;
754 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
755 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
757 /* Avoid stupid compiler warnings */
758 jnrA
= jnrB
= jnrC
= jnrD
= 0;
767 for(iidx
=0;iidx
<4*DIM
;iidx
++)
772 /* Start outer loop over neighborlists */
773 for(iidx
=0; iidx
<nri
; iidx
++)
775 /* Load shift vector for this list */
776 i_shift_offset
= DIM
*shiftidx
[iidx
];
778 /* Load limits for loop over neighbors */
779 j_index_start
= jindex
[iidx
];
780 j_index_end
= jindex
[iidx
+1];
782 /* Get outer coordinate index */
784 i_coord_offset
= DIM
*inr
;
786 /* Load i particle coords and add shift vector */
787 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
788 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
790 fix0
= _mm_setzero_ps();
791 fiy0
= _mm_setzero_ps();
792 fiz0
= _mm_setzero_ps();
793 fix1
= _mm_setzero_ps();
794 fiy1
= _mm_setzero_ps();
795 fiz1
= _mm_setzero_ps();
796 fix2
= _mm_setzero_ps();
797 fiy2
= _mm_setzero_ps();
798 fiz2
= _mm_setzero_ps();
800 /* Start inner kernel loop */
801 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
804 /* Get j neighbor index, and coordinate index */
809 j_coord_offsetA
= DIM
*jnrA
;
810 j_coord_offsetB
= DIM
*jnrB
;
811 j_coord_offsetC
= DIM
*jnrC
;
812 j_coord_offsetD
= DIM
*jnrD
;
814 /* load j atom coordinates */
815 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
816 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
819 /* Calculate displacement vector */
820 dx00
= _mm_sub_ps(ix0
,jx0
);
821 dy00
= _mm_sub_ps(iy0
,jy0
);
822 dz00
= _mm_sub_ps(iz0
,jz0
);
823 dx10
= _mm_sub_ps(ix1
,jx0
);
824 dy10
= _mm_sub_ps(iy1
,jy0
);
825 dz10
= _mm_sub_ps(iz1
,jz0
);
826 dx20
= _mm_sub_ps(ix2
,jx0
);
827 dy20
= _mm_sub_ps(iy2
,jy0
);
828 dz20
= _mm_sub_ps(iz2
,jz0
);
830 /* Calculate squared distance and things based on it */
831 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
832 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
833 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
835 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
836 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
837 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
839 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
840 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
841 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
843 /* Load parameters for j particles */
844 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
845 charge
+jnrC
+0,charge
+jnrD
+0);
847 fjx0
= _mm_setzero_ps();
848 fjy0
= _mm_setzero_ps();
849 fjz0
= _mm_setzero_ps();
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
858 r00
= _mm_mul_ps(rsq00
,rinv00
);
860 /* Compute parameters for interactions between i and j atoms */
861 qq00
= _mm_mul_ps(iq0
,jq0
);
863 /* EWALD ELECTROSTATICS */
865 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
866 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
867 ewitab
= _mm_cvttps_epi32(ewrt
);
868 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
869 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
870 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
872 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
873 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
875 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
879 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
881 /* Calculate temporary vectorial force */
882 tx
= _mm_mul_ps(fscal
,dx00
);
883 ty
= _mm_mul_ps(fscal
,dy00
);
884 tz
= _mm_mul_ps(fscal
,dz00
);
886 /* Update vectorial force */
887 fix0
= _mm_add_ps(fix0
,tx
);
888 fiy0
= _mm_add_ps(fiy0
,ty
);
889 fiz0
= _mm_add_ps(fiz0
,tz
);
891 fjx0
= _mm_add_ps(fjx0
,tx
);
892 fjy0
= _mm_add_ps(fjy0
,ty
);
893 fjz0
= _mm_add_ps(fjz0
,tz
);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
904 r10
= _mm_mul_ps(rsq10
,rinv10
);
906 /* Compute parameters for interactions between i and j atoms */
907 qq10
= _mm_mul_ps(iq1
,jq0
);
909 /* EWALD ELECTROSTATICS */
911 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
912 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
913 ewitab
= _mm_cvttps_epi32(ewrt
);
914 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
915 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
916 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
918 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
919 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
921 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
925 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
927 /* Calculate temporary vectorial force */
928 tx
= _mm_mul_ps(fscal
,dx10
);
929 ty
= _mm_mul_ps(fscal
,dy10
);
930 tz
= _mm_mul_ps(fscal
,dz10
);
932 /* Update vectorial force */
933 fix1
= _mm_add_ps(fix1
,tx
);
934 fiy1
= _mm_add_ps(fiy1
,ty
);
935 fiz1
= _mm_add_ps(fiz1
,tz
);
937 fjx0
= _mm_add_ps(fjx0
,tx
);
938 fjy0
= _mm_add_ps(fjy0
,ty
);
939 fjz0
= _mm_add_ps(fjz0
,tz
);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
950 r20
= _mm_mul_ps(rsq20
,rinv20
);
952 /* Compute parameters for interactions between i and j atoms */
953 qq20
= _mm_mul_ps(iq2
,jq0
);
955 /* EWALD ELECTROSTATICS */
957 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
958 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
959 ewitab
= _mm_cvttps_epi32(ewrt
);
960 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
961 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
962 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
964 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
965 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
967 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
971 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
973 /* Calculate temporary vectorial force */
974 tx
= _mm_mul_ps(fscal
,dx20
);
975 ty
= _mm_mul_ps(fscal
,dy20
);
976 tz
= _mm_mul_ps(fscal
,dz20
);
978 /* Update vectorial force */
979 fix2
= _mm_add_ps(fix2
,tx
);
980 fiy2
= _mm_add_ps(fiy2
,ty
);
981 fiz2
= _mm_add_ps(fiz2
,tz
);
983 fjx0
= _mm_add_ps(fjx0
,tx
);
984 fjy0
= _mm_add_ps(fjy0
,ty
);
985 fjz0
= _mm_add_ps(fjz0
,tz
);
989 fjptrA
= f
+j_coord_offsetA
;
990 fjptrB
= f
+j_coord_offsetB
;
991 fjptrC
= f
+j_coord_offsetC
;
992 fjptrD
= f
+j_coord_offsetD
;
994 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
996 /* Inner loop uses 117 flops */
1002 /* Get j neighbor index, and coordinate index */
1003 jnrlistA
= jjnr
[jidx
];
1004 jnrlistB
= jjnr
[jidx
+1];
1005 jnrlistC
= jjnr
[jidx
+2];
1006 jnrlistD
= jjnr
[jidx
+3];
1007 /* Sign of each element will be negative for non-real atoms.
1008 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1009 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1011 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1012 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1013 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1014 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1015 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1016 j_coord_offsetA
= DIM
*jnrA
;
1017 j_coord_offsetB
= DIM
*jnrB
;
1018 j_coord_offsetC
= DIM
*jnrC
;
1019 j_coord_offsetD
= DIM
*jnrD
;
1021 /* load j atom coordinates */
1022 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1023 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1026 /* Calculate displacement vector */
1027 dx00
= _mm_sub_ps(ix0
,jx0
);
1028 dy00
= _mm_sub_ps(iy0
,jy0
);
1029 dz00
= _mm_sub_ps(iz0
,jz0
);
1030 dx10
= _mm_sub_ps(ix1
,jx0
);
1031 dy10
= _mm_sub_ps(iy1
,jy0
);
1032 dz10
= _mm_sub_ps(iz1
,jz0
);
1033 dx20
= _mm_sub_ps(ix2
,jx0
);
1034 dy20
= _mm_sub_ps(iy2
,jy0
);
1035 dz20
= _mm_sub_ps(iz2
,jz0
);
1037 /* Calculate squared distance and things based on it */
1038 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1039 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1040 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1042 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1043 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1044 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1046 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1047 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1048 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1050 /* Load parameters for j particles */
1051 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1052 charge
+jnrC
+0,charge
+jnrD
+0);
1054 fjx0
= _mm_setzero_ps();
1055 fjy0
= _mm_setzero_ps();
1056 fjz0
= _mm_setzero_ps();
1058 /**************************
1059 * CALCULATE INTERACTIONS *
1060 **************************/
1062 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1065 r00
= _mm_mul_ps(rsq00
,rinv00
);
1066 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1068 /* Compute parameters for interactions between i and j atoms */
1069 qq00
= _mm_mul_ps(iq0
,jq0
);
1071 /* EWALD ELECTROSTATICS */
1073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1074 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1075 ewitab
= _mm_cvttps_epi32(ewrt
);
1076 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1077 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1078 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1080 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1081 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1083 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1087 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1089 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1091 /* Calculate temporary vectorial force */
1092 tx
= _mm_mul_ps(fscal
,dx00
);
1093 ty
= _mm_mul_ps(fscal
,dy00
);
1094 tz
= _mm_mul_ps(fscal
,dz00
);
1096 /* Update vectorial force */
1097 fix0
= _mm_add_ps(fix0
,tx
);
1098 fiy0
= _mm_add_ps(fiy0
,ty
);
1099 fiz0
= _mm_add_ps(fiz0
,tz
);
1101 fjx0
= _mm_add_ps(fjx0
,tx
);
1102 fjy0
= _mm_add_ps(fjy0
,ty
);
1103 fjz0
= _mm_add_ps(fjz0
,tz
);
1107 /**************************
1108 * CALCULATE INTERACTIONS *
1109 **************************/
1111 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1114 r10
= _mm_mul_ps(rsq10
,rinv10
);
1115 r10
= _mm_andnot_ps(dummy_mask
,r10
);
1117 /* Compute parameters for interactions between i and j atoms */
1118 qq10
= _mm_mul_ps(iq1
,jq0
);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1124 ewitab
= _mm_cvttps_epi32(ewrt
);
1125 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1126 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1127 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1129 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1130 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1132 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1136 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1138 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1140 /* Calculate temporary vectorial force */
1141 tx
= _mm_mul_ps(fscal
,dx10
);
1142 ty
= _mm_mul_ps(fscal
,dy10
);
1143 tz
= _mm_mul_ps(fscal
,dz10
);
1145 /* Update vectorial force */
1146 fix1
= _mm_add_ps(fix1
,tx
);
1147 fiy1
= _mm_add_ps(fiy1
,ty
);
1148 fiz1
= _mm_add_ps(fiz1
,tz
);
1150 fjx0
= _mm_add_ps(fjx0
,tx
);
1151 fjy0
= _mm_add_ps(fjy0
,ty
);
1152 fjz0
= _mm_add_ps(fjz0
,tz
);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1163 r20
= _mm_mul_ps(rsq20
,rinv20
);
1164 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1166 /* Compute parameters for interactions between i and j atoms */
1167 qq20
= _mm_mul_ps(iq2
,jq0
);
1169 /* EWALD ELECTROSTATICS */
1171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1172 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1173 ewitab
= _mm_cvttps_epi32(ewrt
);
1174 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1175 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1176 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1178 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1179 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1181 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1185 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1187 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1189 /* Calculate temporary vectorial force */
1190 tx
= _mm_mul_ps(fscal
,dx20
);
1191 ty
= _mm_mul_ps(fscal
,dy20
);
1192 tz
= _mm_mul_ps(fscal
,dz20
);
1194 /* Update vectorial force */
1195 fix2
= _mm_add_ps(fix2
,tx
);
1196 fiy2
= _mm_add_ps(fiy2
,ty
);
1197 fiz2
= _mm_add_ps(fiz2
,tz
);
1199 fjx0
= _mm_add_ps(fjx0
,tx
);
1200 fjy0
= _mm_add_ps(fjy0
,ty
);
1201 fjz0
= _mm_add_ps(fjz0
,tz
);
1205 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1206 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1207 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1208 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1210 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
1212 /* Inner loop uses 120 flops */
1215 /* End of innermost loop */
1217 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1218 f
+i_coord_offset
,fshift
+i_shift_offset
);
1220 /* Increment number of inner iterations */
1221 inneriter
+= j_index_end
- j_index_start
;
1223 /* Outer loop uses 18 flops */
1226 /* Increment number of outer iterations */
1229 /* Update outer/inner flops */
1231 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*120);