2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
93 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
96 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
98 __m128 dummy_mask
,cutoff_mask
;
99 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
100 __m128 one
= _mm_set1_ps(1.0);
101 __m128 two
= _mm_set1_ps(2.0);
107 jindex
= nlist
->jindex
;
109 shiftidx
= nlist
->shift
;
111 shiftvec
= fr
->shift_vec
[0];
112 fshift
= fr
->fshift
[0];
113 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
114 charge
= mdatoms
->chargeA
;
116 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
117 ewtab
= fr
->ic
->tabq_coul_FDV0
;
118 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
119 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
121 /* Setup water-specific parameters */
122 inr
= nlist
->iinr
[0];
123 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
124 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
125 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
127 /* Avoid stupid compiler warnings */
128 jnrA
= jnrB
= jnrC
= jnrD
= 0;
137 for(iidx
=0;iidx
<4*DIM
;iidx
++)
142 /* Start outer loop over neighborlists */
143 for(iidx
=0; iidx
<nri
; iidx
++)
145 /* Load shift vector for this list */
146 i_shift_offset
= DIM
*shiftidx
[iidx
];
148 /* Load limits for loop over neighbors */
149 j_index_start
= jindex
[iidx
];
150 j_index_end
= jindex
[iidx
+1];
152 /* Get outer coordinate index */
154 i_coord_offset
= DIM
*inr
;
156 /* Load i particle coords and add shift vector */
157 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
158 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
160 fix1
= _mm_setzero_ps();
161 fiy1
= _mm_setzero_ps();
162 fiz1
= _mm_setzero_ps();
163 fix2
= _mm_setzero_ps();
164 fiy2
= _mm_setzero_ps();
165 fiz2
= _mm_setzero_ps();
166 fix3
= _mm_setzero_ps();
167 fiy3
= _mm_setzero_ps();
168 fiz3
= _mm_setzero_ps();
170 /* Reset potential sums */
171 velecsum
= _mm_setzero_ps();
173 /* Start inner kernel loop */
174 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
177 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA
= DIM
*jnrA
;
183 j_coord_offsetB
= DIM
*jnrB
;
184 j_coord_offsetC
= DIM
*jnrC
;
185 j_coord_offsetD
= DIM
*jnrD
;
187 /* load j atom coordinates */
188 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
189 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
192 /* Calculate displacement vector */
193 dx10
= _mm_sub_ps(ix1
,jx0
);
194 dy10
= _mm_sub_ps(iy1
,jy0
);
195 dz10
= _mm_sub_ps(iz1
,jz0
);
196 dx20
= _mm_sub_ps(ix2
,jx0
);
197 dy20
= _mm_sub_ps(iy2
,jy0
);
198 dz20
= _mm_sub_ps(iz2
,jz0
);
199 dx30
= _mm_sub_ps(ix3
,jx0
);
200 dy30
= _mm_sub_ps(iy3
,jy0
);
201 dz30
= _mm_sub_ps(iz3
,jz0
);
203 /* Calculate squared distance and things based on it */
204 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
205 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
206 rsq30
= gmx_mm_calc_rsq_ps(dx30
,dy30
,dz30
);
208 rinv10
= sse2_invsqrt_f(rsq10
);
209 rinv20
= sse2_invsqrt_f(rsq20
);
210 rinv30
= sse2_invsqrt_f(rsq30
);
212 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
213 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
214 rinvsq30
= _mm_mul_ps(rinv30
,rinv30
);
216 /* Load parameters for j particles */
217 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
218 charge
+jnrC
+0,charge
+jnrD
+0);
220 fjx0
= _mm_setzero_ps();
221 fjy0
= _mm_setzero_ps();
222 fjz0
= _mm_setzero_ps();
224 /**************************
225 * CALCULATE INTERACTIONS *
226 **************************/
228 r10
= _mm_mul_ps(rsq10
,rinv10
);
230 /* Compute parameters for interactions between i and j atoms */
231 qq10
= _mm_mul_ps(iq1
,jq0
);
233 /* EWALD ELECTROSTATICS */
235 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
236 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
237 ewitab
= _mm_cvttps_epi32(ewrt
);
238 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
239 ewitab
= _mm_slli_epi32(ewitab
,2);
240 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
241 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
242 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
243 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
244 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
245 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
246 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
247 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
248 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
250 /* Update potential sum for this i atom from the interaction with this j atom. */
251 velecsum
= _mm_add_ps(velecsum
,velec
);
255 /* Calculate temporary vectorial force */
256 tx
= _mm_mul_ps(fscal
,dx10
);
257 ty
= _mm_mul_ps(fscal
,dy10
);
258 tz
= _mm_mul_ps(fscal
,dz10
);
260 /* Update vectorial force */
261 fix1
= _mm_add_ps(fix1
,tx
);
262 fiy1
= _mm_add_ps(fiy1
,ty
);
263 fiz1
= _mm_add_ps(fiz1
,tz
);
265 fjx0
= _mm_add_ps(fjx0
,tx
);
266 fjy0
= _mm_add_ps(fjy0
,ty
);
267 fjz0
= _mm_add_ps(fjz0
,tz
);
269 /**************************
270 * CALCULATE INTERACTIONS *
271 **************************/
273 r20
= _mm_mul_ps(rsq20
,rinv20
);
275 /* Compute parameters for interactions between i and j atoms */
276 qq20
= _mm_mul_ps(iq2
,jq0
);
278 /* EWALD ELECTROSTATICS */
280 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
281 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
282 ewitab
= _mm_cvttps_epi32(ewrt
);
283 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
284 ewitab
= _mm_slli_epi32(ewitab
,2);
285 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
286 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
287 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
288 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
289 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
290 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
291 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
292 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
293 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 velecsum
= _mm_add_ps(velecsum
,velec
);
300 /* Calculate temporary vectorial force */
301 tx
= _mm_mul_ps(fscal
,dx20
);
302 ty
= _mm_mul_ps(fscal
,dy20
);
303 tz
= _mm_mul_ps(fscal
,dz20
);
305 /* Update vectorial force */
306 fix2
= _mm_add_ps(fix2
,tx
);
307 fiy2
= _mm_add_ps(fiy2
,ty
);
308 fiz2
= _mm_add_ps(fiz2
,tz
);
310 fjx0
= _mm_add_ps(fjx0
,tx
);
311 fjy0
= _mm_add_ps(fjy0
,ty
);
312 fjz0
= _mm_add_ps(fjz0
,tz
);
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 r30
= _mm_mul_ps(rsq30
,rinv30
);
320 /* Compute parameters for interactions between i and j atoms */
321 qq30
= _mm_mul_ps(iq3
,jq0
);
323 /* EWALD ELECTROSTATICS */
325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
326 ewrt
= _mm_mul_ps(r30
,ewtabscale
);
327 ewitab
= _mm_cvttps_epi32(ewrt
);
328 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
329 ewitab
= _mm_slli_epi32(ewitab
,2);
330 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
331 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
332 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
333 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
334 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
335 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
336 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
337 velec
= _mm_mul_ps(qq30
,_mm_sub_ps(rinv30
,velec
));
338 felec
= _mm_mul_ps(_mm_mul_ps(qq30
,rinv30
),_mm_sub_ps(rinvsq30
,felec
));
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velecsum
= _mm_add_ps(velecsum
,velec
);
345 /* Calculate temporary vectorial force */
346 tx
= _mm_mul_ps(fscal
,dx30
);
347 ty
= _mm_mul_ps(fscal
,dy30
);
348 tz
= _mm_mul_ps(fscal
,dz30
);
350 /* Update vectorial force */
351 fix3
= _mm_add_ps(fix3
,tx
);
352 fiy3
= _mm_add_ps(fiy3
,ty
);
353 fiz3
= _mm_add_ps(fiz3
,tz
);
355 fjx0
= _mm_add_ps(fjx0
,tx
);
356 fjy0
= _mm_add_ps(fjy0
,ty
);
357 fjz0
= _mm_add_ps(fjz0
,tz
);
359 fjptrA
= f
+j_coord_offsetA
;
360 fjptrB
= f
+j_coord_offsetB
;
361 fjptrC
= f
+j_coord_offsetC
;
362 fjptrD
= f
+j_coord_offsetD
;
364 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
366 /* Inner loop uses 123 flops */
372 /* Get j neighbor index, and coordinate index */
373 jnrlistA
= jjnr
[jidx
];
374 jnrlistB
= jjnr
[jidx
+1];
375 jnrlistC
= jjnr
[jidx
+2];
376 jnrlistD
= jjnr
[jidx
+3];
377 /* Sign of each element will be negative for non-real atoms.
378 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
379 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
381 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
382 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
383 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
384 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
385 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
386 j_coord_offsetA
= DIM
*jnrA
;
387 j_coord_offsetB
= DIM
*jnrB
;
388 j_coord_offsetC
= DIM
*jnrC
;
389 j_coord_offsetD
= DIM
*jnrD
;
391 /* load j atom coordinates */
392 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
393 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
396 /* Calculate displacement vector */
397 dx10
= _mm_sub_ps(ix1
,jx0
);
398 dy10
= _mm_sub_ps(iy1
,jy0
);
399 dz10
= _mm_sub_ps(iz1
,jz0
);
400 dx20
= _mm_sub_ps(ix2
,jx0
);
401 dy20
= _mm_sub_ps(iy2
,jy0
);
402 dz20
= _mm_sub_ps(iz2
,jz0
);
403 dx30
= _mm_sub_ps(ix3
,jx0
);
404 dy30
= _mm_sub_ps(iy3
,jy0
);
405 dz30
= _mm_sub_ps(iz3
,jz0
);
407 /* Calculate squared distance and things based on it */
408 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
409 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
410 rsq30
= gmx_mm_calc_rsq_ps(dx30
,dy30
,dz30
);
412 rinv10
= sse2_invsqrt_f(rsq10
);
413 rinv20
= sse2_invsqrt_f(rsq20
);
414 rinv30
= sse2_invsqrt_f(rsq30
);
416 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
417 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
418 rinvsq30
= _mm_mul_ps(rinv30
,rinv30
);
420 /* Load parameters for j particles */
421 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
422 charge
+jnrC
+0,charge
+jnrD
+0);
424 fjx0
= _mm_setzero_ps();
425 fjy0
= _mm_setzero_ps();
426 fjz0
= _mm_setzero_ps();
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 r10
= _mm_mul_ps(rsq10
,rinv10
);
433 r10
= _mm_andnot_ps(dummy_mask
,r10
);
435 /* Compute parameters for interactions between i and j atoms */
436 qq10
= _mm_mul_ps(iq1
,jq0
);
438 /* EWALD ELECTROSTATICS */
440 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
441 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
442 ewitab
= _mm_cvttps_epi32(ewrt
);
443 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
444 ewitab
= _mm_slli_epi32(ewitab
,2);
445 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
446 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
447 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
448 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
449 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
450 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
451 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
452 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
453 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velec
= _mm_andnot_ps(dummy_mask
,velec
);
457 velecsum
= _mm_add_ps(velecsum
,velec
);
461 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
463 /* Calculate temporary vectorial force */
464 tx
= _mm_mul_ps(fscal
,dx10
);
465 ty
= _mm_mul_ps(fscal
,dy10
);
466 tz
= _mm_mul_ps(fscal
,dz10
);
468 /* Update vectorial force */
469 fix1
= _mm_add_ps(fix1
,tx
);
470 fiy1
= _mm_add_ps(fiy1
,ty
);
471 fiz1
= _mm_add_ps(fiz1
,tz
);
473 fjx0
= _mm_add_ps(fjx0
,tx
);
474 fjy0
= _mm_add_ps(fjy0
,ty
);
475 fjz0
= _mm_add_ps(fjz0
,tz
);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 r20
= _mm_mul_ps(rsq20
,rinv20
);
482 r20
= _mm_andnot_ps(dummy_mask
,r20
);
484 /* Compute parameters for interactions between i and j atoms */
485 qq20
= _mm_mul_ps(iq2
,jq0
);
487 /* EWALD ELECTROSTATICS */
489 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
490 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
491 ewitab
= _mm_cvttps_epi32(ewrt
);
492 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
493 ewitab
= _mm_slli_epi32(ewitab
,2);
494 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
495 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
496 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
497 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
498 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
499 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
500 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
501 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
502 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
504 /* Update potential sum for this i atom from the interaction with this j atom. */
505 velec
= _mm_andnot_ps(dummy_mask
,velec
);
506 velecsum
= _mm_add_ps(velecsum
,velec
);
510 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
512 /* Calculate temporary vectorial force */
513 tx
= _mm_mul_ps(fscal
,dx20
);
514 ty
= _mm_mul_ps(fscal
,dy20
);
515 tz
= _mm_mul_ps(fscal
,dz20
);
517 /* Update vectorial force */
518 fix2
= _mm_add_ps(fix2
,tx
);
519 fiy2
= _mm_add_ps(fiy2
,ty
);
520 fiz2
= _mm_add_ps(fiz2
,tz
);
522 fjx0
= _mm_add_ps(fjx0
,tx
);
523 fjy0
= _mm_add_ps(fjy0
,ty
);
524 fjz0
= _mm_add_ps(fjz0
,tz
);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 r30
= _mm_mul_ps(rsq30
,rinv30
);
531 r30
= _mm_andnot_ps(dummy_mask
,r30
);
533 /* Compute parameters for interactions between i and j atoms */
534 qq30
= _mm_mul_ps(iq3
,jq0
);
536 /* EWALD ELECTROSTATICS */
538 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
539 ewrt
= _mm_mul_ps(r30
,ewtabscale
);
540 ewitab
= _mm_cvttps_epi32(ewrt
);
541 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
542 ewitab
= _mm_slli_epi32(ewitab
,2);
543 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
544 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
545 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
546 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
547 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
548 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
549 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
550 velec
= _mm_mul_ps(qq30
,_mm_sub_ps(rinv30
,velec
));
551 felec
= _mm_mul_ps(_mm_mul_ps(qq30
,rinv30
),_mm_sub_ps(rinvsq30
,felec
));
553 /* Update potential sum for this i atom from the interaction with this j atom. */
554 velec
= _mm_andnot_ps(dummy_mask
,velec
);
555 velecsum
= _mm_add_ps(velecsum
,velec
);
559 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
561 /* Calculate temporary vectorial force */
562 tx
= _mm_mul_ps(fscal
,dx30
);
563 ty
= _mm_mul_ps(fscal
,dy30
);
564 tz
= _mm_mul_ps(fscal
,dz30
);
566 /* Update vectorial force */
567 fix3
= _mm_add_ps(fix3
,tx
);
568 fiy3
= _mm_add_ps(fiy3
,ty
);
569 fiz3
= _mm_add_ps(fiz3
,tz
);
571 fjx0
= _mm_add_ps(fjx0
,tx
);
572 fjy0
= _mm_add_ps(fjy0
,ty
);
573 fjz0
= _mm_add_ps(fjz0
,tz
);
575 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
576 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
577 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
578 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
580 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
582 /* Inner loop uses 126 flops */
585 /* End of innermost loop */
587 gmx_mm_update_iforce_3atom_swizzle_ps(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
588 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
591 /* Update potential energies */
592 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
594 /* Increment number of inner iterations */
595 inneriter
+= j_index_end
- j_index_start
;
597 /* Outer loop uses 19 flops */
600 /* Increment number of outer iterations */
603 /* Update outer/inner flops */
605 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_VF
,outeriter
*19 + inneriter
*126);
608 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
609 * Electrostatics interaction: Ewald
610 * VdW interaction: None
611 * Geometry: Water4-Particle
612 * Calculate force/pot: Force
615 nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
616 (t_nblist
* gmx_restrict nlist
,
617 rvec
* gmx_restrict xx
,
618 rvec
* gmx_restrict ff
,
619 struct t_forcerec
* gmx_restrict fr
,
620 t_mdatoms
* gmx_restrict mdatoms
,
621 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
622 t_nrnb
* gmx_restrict nrnb
)
624 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
625 * just 0 for non-waters.
626 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
627 * jnr indices corresponding to data put in the four positions in the SIMD register.
629 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
630 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
631 int jnrA
,jnrB
,jnrC
,jnrD
;
632 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
633 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
634 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
636 real
*shiftvec
,*fshift
,*x
,*f
;
637 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
639 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
641 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
643 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
645 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
646 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
647 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
648 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
649 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
650 __m128 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
651 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
654 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
656 __m128 dummy_mask
,cutoff_mask
;
657 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
658 __m128 one
= _mm_set1_ps(1.0);
659 __m128 two
= _mm_set1_ps(2.0);
665 jindex
= nlist
->jindex
;
667 shiftidx
= nlist
->shift
;
669 shiftvec
= fr
->shift_vec
[0];
670 fshift
= fr
->fshift
[0];
671 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
672 charge
= mdatoms
->chargeA
;
674 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
675 ewtab
= fr
->ic
->tabq_coul_F
;
676 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
677 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
679 /* Setup water-specific parameters */
680 inr
= nlist
->iinr
[0];
681 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
682 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
683 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
685 /* Avoid stupid compiler warnings */
686 jnrA
= jnrB
= jnrC
= jnrD
= 0;
695 for(iidx
=0;iidx
<4*DIM
;iidx
++)
700 /* Start outer loop over neighborlists */
701 for(iidx
=0; iidx
<nri
; iidx
++)
703 /* Load shift vector for this list */
704 i_shift_offset
= DIM
*shiftidx
[iidx
];
706 /* Load limits for loop over neighbors */
707 j_index_start
= jindex
[iidx
];
708 j_index_end
= jindex
[iidx
+1];
710 /* Get outer coordinate index */
712 i_coord_offset
= DIM
*inr
;
714 /* Load i particle coords and add shift vector */
715 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
716 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
718 fix1
= _mm_setzero_ps();
719 fiy1
= _mm_setzero_ps();
720 fiz1
= _mm_setzero_ps();
721 fix2
= _mm_setzero_ps();
722 fiy2
= _mm_setzero_ps();
723 fiz2
= _mm_setzero_ps();
724 fix3
= _mm_setzero_ps();
725 fiy3
= _mm_setzero_ps();
726 fiz3
= _mm_setzero_ps();
728 /* Start inner kernel loop */
729 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
732 /* Get j neighbor index, and coordinate index */
737 j_coord_offsetA
= DIM
*jnrA
;
738 j_coord_offsetB
= DIM
*jnrB
;
739 j_coord_offsetC
= DIM
*jnrC
;
740 j_coord_offsetD
= DIM
*jnrD
;
742 /* load j atom coordinates */
743 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
744 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
747 /* Calculate displacement vector */
748 dx10
= _mm_sub_ps(ix1
,jx0
);
749 dy10
= _mm_sub_ps(iy1
,jy0
);
750 dz10
= _mm_sub_ps(iz1
,jz0
);
751 dx20
= _mm_sub_ps(ix2
,jx0
);
752 dy20
= _mm_sub_ps(iy2
,jy0
);
753 dz20
= _mm_sub_ps(iz2
,jz0
);
754 dx30
= _mm_sub_ps(ix3
,jx0
);
755 dy30
= _mm_sub_ps(iy3
,jy0
);
756 dz30
= _mm_sub_ps(iz3
,jz0
);
758 /* Calculate squared distance and things based on it */
759 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
760 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
761 rsq30
= gmx_mm_calc_rsq_ps(dx30
,dy30
,dz30
);
763 rinv10
= sse2_invsqrt_f(rsq10
);
764 rinv20
= sse2_invsqrt_f(rsq20
);
765 rinv30
= sse2_invsqrt_f(rsq30
);
767 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
768 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
769 rinvsq30
= _mm_mul_ps(rinv30
,rinv30
);
771 /* Load parameters for j particles */
772 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
773 charge
+jnrC
+0,charge
+jnrD
+0);
775 fjx0
= _mm_setzero_ps();
776 fjy0
= _mm_setzero_ps();
777 fjz0
= _mm_setzero_ps();
779 /**************************
780 * CALCULATE INTERACTIONS *
781 **************************/
783 r10
= _mm_mul_ps(rsq10
,rinv10
);
785 /* Compute parameters for interactions between i and j atoms */
786 qq10
= _mm_mul_ps(iq1
,jq0
);
788 /* EWALD ELECTROSTATICS */
790 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
791 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
792 ewitab
= _mm_cvttps_epi32(ewrt
);
793 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
794 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
795 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
797 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
798 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
802 /* Calculate temporary vectorial force */
803 tx
= _mm_mul_ps(fscal
,dx10
);
804 ty
= _mm_mul_ps(fscal
,dy10
);
805 tz
= _mm_mul_ps(fscal
,dz10
);
807 /* Update vectorial force */
808 fix1
= _mm_add_ps(fix1
,tx
);
809 fiy1
= _mm_add_ps(fiy1
,ty
);
810 fiz1
= _mm_add_ps(fiz1
,tz
);
812 fjx0
= _mm_add_ps(fjx0
,tx
);
813 fjy0
= _mm_add_ps(fjy0
,ty
);
814 fjz0
= _mm_add_ps(fjz0
,tz
);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 r20
= _mm_mul_ps(rsq20
,rinv20
);
822 /* Compute parameters for interactions between i and j atoms */
823 qq20
= _mm_mul_ps(iq2
,jq0
);
825 /* EWALD ELECTROSTATICS */
827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
828 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
829 ewitab
= _mm_cvttps_epi32(ewrt
);
830 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
831 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
832 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
834 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
835 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
839 /* Calculate temporary vectorial force */
840 tx
= _mm_mul_ps(fscal
,dx20
);
841 ty
= _mm_mul_ps(fscal
,dy20
);
842 tz
= _mm_mul_ps(fscal
,dz20
);
844 /* Update vectorial force */
845 fix2
= _mm_add_ps(fix2
,tx
);
846 fiy2
= _mm_add_ps(fiy2
,ty
);
847 fiz2
= _mm_add_ps(fiz2
,tz
);
849 fjx0
= _mm_add_ps(fjx0
,tx
);
850 fjy0
= _mm_add_ps(fjy0
,ty
);
851 fjz0
= _mm_add_ps(fjz0
,tz
);
853 /**************************
854 * CALCULATE INTERACTIONS *
855 **************************/
857 r30
= _mm_mul_ps(rsq30
,rinv30
);
859 /* Compute parameters for interactions between i and j atoms */
860 qq30
= _mm_mul_ps(iq3
,jq0
);
862 /* EWALD ELECTROSTATICS */
864 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
865 ewrt
= _mm_mul_ps(r30
,ewtabscale
);
866 ewitab
= _mm_cvttps_epi32(ewrt
);
867 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
868 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
869 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
871 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
872 felec
= _mm_mul_ps(_mm_mul_ps(qq30
,rinv30
),_mm_sub_ps(rinvsq30
,felec
));
876 /* Calculate temporary vectorial force */
877 tx
= _mm_mul_ps(fscal
,dx30
);
878 ty
= _mm_mul_ps(fscal
,dy30
);
879 tz
= _mm_mul_ps(fscal
,dz30
);
881 /* Update vectorial force */
882 fix3
= _mm_add_ps(fix3
,tx
);
883 fiy3
= _mm_add_ps(fiy3
,ty
);
884 fiz3
= _mm_add_ps(fiz3
,tz
);
886 fjx0
= _mm_add_ps(fjx0
,tx
);
887 fjy0
= _mm_add_ps(fjy0
,ty
);
888 fjz0
= _mm_add_ps(fjz0
,tz
);
890 fjptrA
= f
+j_coord_offsetA
;
891 fjptrB
= f
+j_coord_offsetB
;
892 fjptrC
= f
+j_coord_offsetC
;
893 fjptrD
= f
+j_coord_offsetD
;
895 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
897 /* Inner loop uses 108 flops */
903 /* Get j neighbor index, and coordinate index */
904 jnrlistA
= jjnr
[jidx
];
905 jnrlistB
= jjnr
[jidx
+1];
906 jnrlistC
= jjnr
[jidx
+2];
907 jnrlistD
= jjnr
[jidx
+3];
908 /* Sign of each element will be negative for non-real atoms.
909 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
910 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
912 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
913 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
914 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
915 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
916 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
917 j_coord_offsetA
= DIM
*jnrA
;
918 j_coord_offsetB
= DIM
*jnrB
;
919 j_coord_offsetC
= DIM
*jnrC
;
920 j_coord_offsetD
= DIM
*jnrD
;
922 /* load j atom coordinates */
923 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
924 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
927 /* Calculate displacement vector */
928 dx10
= _mm_sub_ps(ix1
,jx0
);
929 dy10
= _mm_sub_ps(iy1
,jy0
);
930 dz10
= _mm_sub_ps(iz1
,jz0
);
931 dx20
= _mm_sub_ps(ix2
,jx0
);
932 dy20
= _mm_sub_ps(iy2
,jy0
);
933 dz20
= _mm_sub_ps(iz2
,jz0
);
934 dx30
= _mm_sub_ps(ix3
,jx0
);
935 dy30
= _mm_sub_ps(iy3
,jy0
);
936 dz30
= _mm_sub_ps(iz3
,jz0
);
938 /* Calculate squared distance and things based on it */
939 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
940 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
941 rsq30
= gmx_mm_calc_rsq_ps(dx30
,dy30
,dz30
);
943 rinv10
= sse2_invsqrt_f(rsq10
);
944 rinv20
= sse2_invsqrt_f(rsq20
);
945 rinv30
= sse2_invsqrt_f(rsq30
);
947 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
948 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
949 rinvsq30
= _mm_mul_ps(rinv30
,rinv30
);
951 /* Load parameters for j particles */
952 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
953 charge
+jnrC
+0,charge
+jnrD
+0);
955 fjx0
= _mm_setzero_ps();
956 fjy0
= _mm_setzero_ps();
957 fjz0
= _mm_setzero_ps();
959 /**************************
960 * CALCULATE INTERACTIONS *
961 **************************/
963 r10
= _mm_mul_ps(rsq10
,rinv10
);
964 r10
= _mm_andnot_ps(dummy_mask
,r10
);
966 /* Compute parameters for interactions between i and j atoms */
967 qq10
= _mm_mul_ps(iq1
,jq0
);
969 /* EWALD ELECTROSTATICS */
971 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
972 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
973 ewitab
= _mm_cvttps_epi32(ewrt
);
974 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
975 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
976 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
978 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
979 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
983 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
985 /* Calculate temporary vectorial force */
986 tx
= _mm_mul_ps(fscal
,dx10
);
987 ty
= _mm_mul_ps(fscal
,dy10
);
988 tz
= _mm_mul_ps(fscal
,dz10
);
990 /* Update vectorial force */
991 fix1
= _mm_add_ps(fix1
,tx
);
992 fiy1
= _mm_add_ps(fiy1
,ty
);
993 fiz1
= _mm_add_ps(fiz1
,tz
);
995 fjx0
= _mm_add_ps(fjx0
,tx
);
996 fjy0
= _mm_add_ps(fjy0
,ty
);
997 fjz0
= _mm_add_ps(fjz0
,tz
);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 r20
= _mm_mul_ps(rsq20
,rinv20
);
1004 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1006 /* Compute parameters for interactions between i and j atoms */
1007 qq20
= _mm_mul_ps(iq2
,jq0
);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1013 ewitab
= _mm_cvttps_epi32(ewrt
);
1014 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1015 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1016 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1018 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1019 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1023 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1025 /* Calculate temporary vectorial force */
1026 tx
= _mm_mul_ps(fscal
,dx20
);
1027 ty
= _mm_mul_ps(fscal
,dy20
);
1028 tz
= _mm_mul_ps(fscal
,dz20
);
1030 /* Update vectorial force */
1031 fix2
= _mm_add_ps(fix2
,tx
);
1032 fiy2
= _mm_add_ps(fiy2
,ty
);
1033 fiz2
= _mm_add_ps(fiz2
,tz
);
1035 fjx0
= _mm_add_ps(fjx0
,tx
);
1036 fjy0
= _mm_add_ps(fjy0
,ty
);
1037 fjz0
= _mm_add_ps(fjz0
,tz
);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r30
= _mm_mul_ps(rsq30
,rinv30
);
1044 r30
= _mm_andnot_ps(dummy_mask
,r30
);
1046 /* Compute parameters for interactions between i and j atoms */
1047 qq30
= _mm_mul_ps(iq3
,jq0
);
1049 /* EWALD ELECTROSTATICS */
1051 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1052 ewrt
= _mm_mul_ps(r30
,ewtabscale
);
1053 ewitab
= _mm_cvttps_epi32(ewrt
);
1054 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1055 gmx_mm_load_4pair_swizzle_ps(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1056 ewtab
+gmx_mm_extract_epi32(ewitab
,2),ewtab
+gmx_mm_extract_epi32(ewitab
,3),
1058 felec
= _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one
,eweps
),ewtabF
),_mm_mul_ps(eweps
,ewtabFn
));
1059 felec
= _mm_mul_ps(_mm_mul_ps(qq30
,rinv30
),_mm_sub_ps(rinvsq30
,felec
));
1063 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1065 /* Calculate temporary vectorial force */
1066 tx
= _mm_mul_ps(fscal
,dx30
);
1067 ty
= _mm_mul_ps(fscal
,dy30
);
1068 tz
= _mm_mul_ps(fscal
,dz30
);
1070 /* Update vectorial force */
1071 fix3
= _mm_add_ps(fix3
,tx
);
1072 fiy3
= _mm_add_ps(fiy3
,ty
);
1073 fiz3
= _mm_add_ps(fiz3
,tz
);
1075 fjx0
= _mm_add_ps(fjx0
,tx
);
1076 fjy0
= _mm_add_ps(fjy0
,ty
);
1077 fjz0
= _mm_add_ps(fjz0
,tz
);
1079 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1080 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1081 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1082 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1084 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
1086 /* Inner loop uses 111 flops */
1089 /* End of innermost loop */
1091 gmx_mm_update_iforce_3atom_swizzle_ps(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1092 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1094 /* Increment number of inner iterations */
1095 inneriter
+= j_index_end
- j_index_start
;
1097 /* Outer loop uses 18 flops */
1100 /* Increment number of outer iterations */
1103 /* Update outer/inner flops */
1105 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_F
,outeriter
*18 + inneriter
*111);