2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
69 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
71 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
72 int vdwjidx0A
,vdwjidx0B
;
73 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
74 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
75 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
76 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
77 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
80 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
82 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
83 real rswitch_scalar
,d_scalar
;
84 __m128d dummy_mask
,cutoff_mask
;
85 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
86 __m128d one
= _mm_set1_pd(1.0);
87 __m128d two
= _mm_set1_pd(2.0);
93 jindex
= nlist
->jindex
;
95 shiftidx
= nlist
->shift
;
97 shiftvec
= fr
->shift_vec
[0];
98 fshift
= fr
->fshift
[0];
99 facel
= _mm_set1_pd(fr
->epsfac
);
100 charge
= mdatoms
->chargeA
;
102 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
103 ewtab
= fr
->ic
->tabq_coul_FDV0
;
104 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
105 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
107 /* Setup water-specific parameters */
108 inr
= nlist
->iinr
[0];
109 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
110 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
111 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
113 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
114 rcutoff_scalar
= fr
->rcoulomb
;
115 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
116 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
118 rswitch_scalar
= fr
->rcoulomb_switch
;
119 rswitch
= _mm_set1_pd(rswitch_scalar
);
120 /* Setup switch parameters */
121 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
122 d
= _mm_set1_pd(d_scalar
);
123 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
124 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
125 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
126 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
127 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
128 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
130 /* Avoid stupid compiler warnings */
138 /* Start outer loop over neighborlists */
139 for(iidx
=0; iidx
<nri
; iidx
++)
141 /* Load shift vector for this list */
142 i_shift_offset
= DIM
*shiftidx
[iidx
];
144 /* Load limits for loop over neighbors */
145 j_index_start
= jindex
[iidx
];
146 j_index_end
= jindex
[iidx
+1];
148 /* Get outer coordinate index */
150 i_coord_offset
= DIM
*inr
;
152 /* Load i particle coords and add shift vector */
153 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
154 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
156 fix1
= _mm_setzero_pd();
157 fiy1
= _mm_setzero_pd();
158 fiz1
= _mm_setzero_pd();
159 fix2
= _mm_setzero_pd();
160 fiy2
= _mm_setzero_pd();
161 fiz2
= _mm_setzero_pd();
162 fix3
= _mm_setzero_pd();
163 fiy3
= _mm_setzero_pd();
164 fiz3
= _mm_setzero_pd();
166 /* Reset potential sums */
167 velecsum
= _mm_setzero_pd();
169 /* Start inner kernel loop */
170 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
173 /* Get j neighbor index, and coordinate index */
176 j_coord_offsetA
= DIM
*jnrA
;
177 j_coord_offsetB
= DIM
*jnrB
;
179 /* load j atom coordinates */
180 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
183 /* Calculate displacement vector */
184 dx10
= _mm_sub_pd(ix1
,jx0
);
185 dy10
= _mm_sub_pd(iy1
,jy0
);
186 dz10
= _mm_sub_pd(iz1
,jz0
);
187 dx20
= _mm_sub_pd(ix2
,jx0
);
188 dy20
= _mm_sub_pd(iy2
,jy0
);
189 dz20
= _mm_sub_pd(iz2
,jz0
);
190 dx30
= _mm_sub_pd(ix3
,jx0
);
191 dy30
= _mm_sub_pd(iy3
,jy0
);
192 dz30
= _mm_sub_pd(iz3
,jz0
);
194 /* Calculate squared distance and things based on it */
195 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
196 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
197 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
199 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
200 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
201 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
203 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
204 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
205 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
207 /* Load parameters for j particles */
208 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
210 fjx0
= _mm_setzero_pd();
211 fjy0
= _mm_setzero_pd();
212 fjz0
= _mm_setzero_pd();
214 /**************************
215 * CALCULATE INTERACTIONS *
216 **************************/
218 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
221 r10
= _mm_mul_pd(rsq10
,rinv10
);
223 /* Compute parameters for interactions between i and j atoms */
224 qq10
= _mm_mul_pd(iq1
,jq0
);
226 /* EWALD ELECTROSTATICS */
228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
229 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
230 ewitab
= _mm_cvttpd_epi32(ewrt
);
231 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
232 ewitab
= _mm_slli_epi32(ewitab
,2);
233 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
234 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
235 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
236 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
237 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
238 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
239 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
240 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
241 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
242 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
244 d
= _mm_sub_pd(r10
,rswitch
);
245 d
= _mm_max_pd(d
,_mm_setzero_pd());
246 d2
= _mm_mul_pd(d
,d
);
247 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
249 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
251 /* Evaluate switch function */
252 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
253 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
254 velec
= _mm_mul_pd(velec
,sw
);
255 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
257 /* Update potential sum for this i atom from the interaction with this j atom. */
258 velec
= _mm_and_pd(velec
,cutoff_mask
);
259 velecsum
= _mm_add_pd(velecsum
,velec
);
263 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
265 /* Calculate temporary vectorial force */
266 tx
= _mm_mul_pd(fscal
,dx10
);
267 ty
= _mm_mul_pd(fscal
,dy10
);
268 tz
= _mm_mul_pd(fscal
,dz10
);
270 /* Update vectorial force */
271 fix1
= _mm_add_pd(fix1
,tx
);
272 fiy1
= _mm_add_pd(fiy1
,ty
);
273 fiz1
= _mm_add_pd(fiz1
,tz
);
275 fjx0
= _mm_add_pd(fjx0
,tx
);
276 fjy0
= _mm_add_pd(fjy0
,ty
);
277 fjz0
= _mm_add_pd(fjz0
,tz
);
281 /**************************
282 * CALCULATE INTERACTIONS *
283 **************************/
285 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
288 r20
= _mm_mul_pd(rsq20
,rinv20
);
290 /* Compute parameters for interactions between i and j atoms */
291 qq20
= _mm_mul_pd(iq2
,jq0
);
293 /* EWALD ELECTROSTATICS */
295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
296 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
297 ewitab
= _mm_cvttpd_epi32(ewrt
);
298 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
299 ewitab
= _mm_slli_epi32(ewitab
,2);
300 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
301 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
302 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
303 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
304 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
305 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
306 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
307 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
308 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
309 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
311 d
= _mm_sub_pd(r20
,rswitch
);
312 d
= _mm_max_pd(d
,_mm_setzero_pd());
313 d2
= _mm_mul_pd(d
,d
);
314 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
316 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
318 /* Evaluate switch function */
319 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
320 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
321 velec
= _mm_mul_pd(velec
,sw
);
322 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velec
= _mm_and_pd(velec
,cutoff_mask
);
326 velecsum
= _mm_add_pd(velecsum
,velec
);
330 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
332 /* Calculate temporary vectorial force */
333 tx
= _mm_mul_pd(fscal
,dx20
);
334 ty
= _mm_mul_pd(fscal
,dy20
);
335 tz
= _mm_mul_pd(fscal
,dz20
);
337 /* Update vectorial force */
338 fix2
= _mm_add_pd(fix2
,tx
);
339 fiy2
= _mm_add_pd(fiy2
,ty
);
340 fiz2
= _mm_add_pd(fiz2
,tz
);
342 fjx0
= _mm_add_pd(fjx0
,tx
);
343 fjy0
= _mm_add_pd(fjy0
,ty
);
344 fjz0
= _mm_add_pd(fjz0
,tz
);
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
355 r30
= _mm_mul_pd(rsq30
,rinv30
);
357 /* Compute parameters for interactions between i and j atoms */
358 qq30
= _mm_mul_pd(iq3
,jq0
);
360 /* EWALD ELECTROSTATICS */
362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
363 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
364 ewitab
= _mm_cvttpd_epi32(ewrt
);
365 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
366 ewitab
= _mm_slli_epi32(ewitab
,2);
367 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
368 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
369 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
370 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
371 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
372 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
373 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
374 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
375 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
376 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
378 d
= _mm_sub_pd(r30
,rswitch
);
379 d
= _mm_max_pd(d
,_mm_setzero_pd());
380 d2
= _mm_mul_pd(d
,d
);
381 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
383 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
385 /* Evaluate switch function */
386 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
387 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
388 velec
= _mm_mul_pd(velec
,sw
);
389 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velec
= _mm_and_pd(velec
,cutoff_mask
);
393 velecsum
= _mm_add_pd(velecsum
,velec
);
397 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
399 /* Calculate temporary vectorial force */
400 tx
= _mm_mul_pd(fscal
,dx30
);
401 ty
= _mm_mul_pd(fscal
,dy30
);
402 tz
= _mm_mul_pd(fscal
,dz30
);
404 /* Update vectorial force */
405 fix3
= _mm_add_pd(fix3
,tx
);
406 fiy3
= _mm_add_pd(fiy3
,ty
);
407 fiz3
= _mm_add_pd(fiz3
,tz
);
409 fjx0
= _mm_add_pd(fjx0
,tx
);
410 fjy0
= _mm_add_pd(fjy0
,ty
);
411 fjz0
= _mm_add_pd(fjz0
,tz
);
415 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
417 /* Inner loop uses 198 flops */
424 j_coord_offsetA
= DIM
*jnrA
;
426 /* load j atom coordinates */
427 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
430 /* Calculate displacement vector */
431 dx10
= _mm_sub_pd(ix1
,jx0
);
432 dy10
= _mm_sub_pd(iy1
,jy0
);
433 dz10
= _mm_sub_pd(iz1
,jz0
);
434 dx20
= _mm_sub_pd(ix2
,jx0
);
435 dy20
= _mm_sub_pd(iy2
,jy0
);
436 dz20
= _mm_sub_pd(iz2
,jz0
);
437 dx30
= _mm_sub_pd(ix3
,jx0
);
438 dy30
= _mm_sub_pd(iy3
,jy0
);
439 dz30
= _mm_sub_pd(iz3
,jz0
);
441 /* Calculate squared distance and things based on it */
442 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
443 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
444 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
446 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
447 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
448 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
450 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
451 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
452 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
454 /* Load parameters for j particles */
455 jq0
= _mm_load_sd(charge
+jnrA
+0);
457 fjx0
= _mm_setzero_pd();
458 fjy0
= _mm_setzero_pd();
459 fjz0
= _mm_setzero_pd();
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
468 r10
= _mm_mul_pd(rsq10
,rinv10
);
470 /* Compute parameters for interactions between i and j atoms */
471 qq10
= _mm_mul_pd(iq1
,jq0
);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
477 ewitab
= _mm_cvttpd_epi32(ewrt
);
478 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
479 ewitab
= _mm_slli_epi32(ewitab
,2);
480 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
481 ewtabD
= _mm_setzero_pd();
482 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
483 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
484 ewtabFn
= _mm_setzero_pd();
485 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
486 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
487 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
488 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
489 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
491 d
= _mm_sub_pd(r10
,rswitch
);
492 d
= _mm_max_pd(d
,_mm_setzero_pd());
493 d2
= _mm_mul_pd(d
,d
);
494 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
496 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
498 /* Evaluate switch function */
499 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
500 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
501 velec
= _mm_mul_pd(velec
,sw
);
502 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
504 /* Update potential sum for this i atom from the interaction with this j atom. */
505 velec
= _mm_and_pd(velec
,cutoff_mask
);
506 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
507 velecsum
= _mm_add_pd(velecsum
,velec
);
511 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
513 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
515 /* Calculate temporary vectorial force */
516 tx
= _mm_mul_pd(fscal
,dx10
);
517 ty
= _mm_mul_pd(fscal
,dy10
);
518 tz
= _mm_mul_pd(fscal
,dz10
);
520 /* Update vectorial force */
521 fix1
= _mm_add_pd(fix1
,tx
);
522 fiy1
= _mm_add_pd(fiy1
,ty
);
523 fiz1
= _mm_add_pd(fiz1
,tz
);
525 fjx0
= _mm_add_pd(fjx0
,tx
);
526 fjy0
= _mm_add_pd(fjy0
,ty
);
527 fjz0
= _mm_add_pd(fjz0
,tz
);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
538 r20
= _mm_mul_pd(rsq20
,rinv20
);
540 /* Compute parameters for interactions between i and j atoms */
541 qq20
= _mm_mul_pd(iq2
,jq0
);
543 /* EWALD ELECTROSTATICS */
545 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
547 ewitab
= _mm_cvttpd_epi32(ewrt
);
548 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
549 ewitab
= _mm_slli_epi32(ewitab
,2);
550 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
551 ewtabD
= _mm_setzero_pd();
552 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
553 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
554 ewtabFn
= _mm_setzero_pd();
555 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
556 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
557 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
558 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
559 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
561 d
= _mm_sub_pd(r20
,rswitch
);
562 d
= _mm_max_pd(d
,_mm_setzero_pd());
563 d2
= _mm_mul_pd(d
,d
);
564 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
566 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
568 /* Evaluate switch function */
569 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
570 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
571 velec
= _mm_mul_pd(velec
,sw
);
572 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velec
= _mm_and_pd(velec
,cutoff_mask
);
576 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
577 velecsum
= _mm_add_pd(velecsum
,velec
);
581 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
583 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
585 /* Calculate temporary vectorial force */
586 tx
= _mm_mul_pd(fscal
,dx20
);
587 ty
= _mm_mul_pd(fscal
,dy20
);
588 tz
= _mm_mul_pd(fscal
,dz20
);
590 /* Update vectorial force */
591 fix2
= _mm_add_pd(fix2
,tx
);
592 fiy2
= _mm_add_pd(fiy2
,ty
);
593 fiz2
= _mm_add_pd(fiz2
,tz
);
595 fjx0
= _mm_add_pd(fjx0
,tx
);
596 fjy0
= _mm_add_pd(fjy0
,ty
);
597 fjz0
= _mm_add_pd(fjz0
,tz
);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
608 r30
= _mm_mul_pd(rsq30
,rinv30
);
610 /* Compute parameters for interactions between i and j atoms */
611 qq30
= _mm_mul_pd(iq3
,jq0
);
613 /* EWALD ELECTROSTATICS */
615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
616 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
617 ewitab
= _mm_cvttpd_epi32(ewrt
);
618 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
619 ewitab
= _mm_slli_epi32(ewitab
,2);
620 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
621 ewtabD
= _mm_setzero_pd();
622 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
623 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
624 ewtabFn
= _mm_setzero_pd();
625 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
626 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
627 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
628 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
629 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
631 d
= _mm_sub_pd(r30
,rswitch
);
632 d
= _mm_max_pd(d
,_mm_setzero_pd());
633 d2
= _mm_mul_pd(d
,d
);
634 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
636 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
638 /* Evaluate switch function */
639 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
640 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
641 velec
= _mm_mul_pd(velec
,sw
);
642 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec
= _mm_and_pd(velec
,cutoff_mask
);
646 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
647 velecsum
= _mm_add_pd(velecsum
,velec
);
651 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
653 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
655 /* Calculate temporary vectorial force */
656 tx
= _mm_mul_pd(fscal
,dx30
);
657 ty
= _mm_mul_pd(fscal
,dy30
);
658 tz
= _mm_mul_pd(fscal
,dz30
);
660 /* Update vectorial force */
661 fix3
= _mm_add_pd(fix3
,tx
);
662 fiy3
= _mm_add_pd(fiy3
,ty
);
663 fiz3
= _mm_add_pd(fiz3
,tz
);
665 fjx0
= _mm_add_pd(fjx0
,tx
);
666 fjy0
= _mm_add_pd(fjy0
,ty
);
667 fjz0
= _mm_add_pd(fjz0
,tz
);
671 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
673 /* Inner loop uses 198 flops */
676 /* End of innermost loop */
678 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
679 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
682 /* Update potential energies */
683 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
685 /* Increment number of inner iterations */
686 inneriter
+= j_index_end
- j_index_start
;
688 /* Outer loop uses 19 flops */
691 /* Increment number of outer iterations */
694 /* Update outer/inner flops */
696 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_VF
,outeriter
*19 + inneriter
*198);
699 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double
700 * Electrostatics interaction: Ewald
701 * VdW interaction: None
702 * Geometry: Water4-Particle
703 * Calculate force/pot: Force
706 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double
707 (t_nblist
* gmx_restrict nlist
,
708 rvec
* gmx_restrict xx
,
709 rvec
* gmx_restrict ff
,
710 t_forcerec
* gmx_restrict fr
,
711 t_mdatoms
* gmx_restrict mdatoms
,
712 nb_kernel_data_t
* gmx_restrict kernel_data
,
713 t_nrnb
* gmx_restrict nrnb
)
715 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
716 * just 0 for non-waters.
717 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
718 * jnr indices corresponding to data put in the four positions in the SIMD register.
720 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
721 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
723 int j_coord_offsetA
,j_coord_offsetB
;
724 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
726 real
*shiftvec
,*fshift
,*x
,*f
;
727 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
729 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
731 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
733 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
734 int vdwjidx0A
,vdwjidx0B
;
735 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
736 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
737 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
738 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
739 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
742 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
744 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
745 real rswitch_scalar
,d_scalar
;
746 __m128d dummy_mask
,cutoff_mask
;
747 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
748 __m128d one
= _mm_set1_pd(1.0);
749 __m128d two
= _mm_set1_pd(2.0);
755 jindex
= nlist
->jindex
;
757 shiftidx
= nlist
->shift
;
759 shiftvec
= fr
->shift_vec
[0];
760 fshift
= fr
->fshift
[0];
761 facel
= _mm_set1_pd(fr
->epsfac
);
762 charge
= mdatoms
->chargeA
;
764 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
765 ewtab
= fr
->ic
->tabq_coul_FDV0
;
766 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
767 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
769 /* Setup water-specific parameters */
770 inr
= nlist
->iinr
[0];
771 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
772 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
773 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
775 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
776 rcutoff_scalar
= fr
->rcoulomb
;
777 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
778 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
780 rswitch_scalar
= fr
->rcoulomb_switch
;
781 rswitch
= _mm_set1_pd(rswitch_scalar
);
782 /* Setup switch parameters */
783 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
784 d
= _mm_set1_pd(d_scalar
);
785 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
786 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
787 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
788 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
789 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
790 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
792 /* Avoid stupid compiler warnings */
800 /* Start outer loop over neighborlists */
801 for(iidx
=0; iidx
<nri
; iidx
++)
803 /* Load shift vector for this list */
804 i_shift_offset
= DIM
*shiftidx
[iidx
];
806 /* Load limits for loop over neighbors */
807 j_index_start
= jindex
[iidx
];
808 j_index_end
= jindex
[iidx
+1];
810 /* Get outer coordinate index */
812 i_coord_offset
= DIM
*inr
;
814 /* Load i particle coords and add shift vector */
815 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
816 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
818 fix1
= _mm_setzero_pd();
819 fiy1
= _mm_setzero_pd();
820 fiz1
= _mm_setzero_pd();
821 fix2
= _mm_setzero_pd();
822 fiy2
= _mm_setzero_pd();
823 fiz2
= _mm_setzero_pd();
824 fix3
= _mm_setzero_pd();
825 fiy3
= _mm_setzero_pd();
826 fiz3
= _mm_setzero_pd();
828 /* Start inner kernel loop */
829 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
832 /* Get j neighbor index, and coordinate index */
835 j_coord_offsetA
= DIM
*jnrA
;
836 j_coord_offsetB
= DIM
*jnrB
;
838 /* load j atom coordinates */
839 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
842 /* Calculate displacement vector */
843 dx10
= _mm_sub_pd(ix1
,jx0
);
844 dy10
= _mm_sub_pd(iy1
,jy0
);
845 dz10
= _mm_sub_pd(iz1
,jz0
);
846 dx20
= _mm_sub_pd(ix2
,jx0
);
847 dy20
= _mm_sub_pd(iy2
,jy0
);
848 dz20
= _mm_sub_pd(iz2
,jz0
);
849 dx30
= _mm_sub_pd(ix3
,jx0
);
850 dy30
= _mm_sub_pd(iy3
,jy0
);
851 dz30
= _mm_sub_pd(iz3
,jz0
);
853 /* Calculate squared distance and things based on it */
854 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
855 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
856 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
858 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
859 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
860 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
862 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
863 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
864 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
866 /* Load parameters for j particles */
867 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
869 fjx0
= _mm_setzero_pd();
870 fjy0
= _mm_setzero_pd();
871 fjz0
= _mm_setzero_pd();
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
880 r10
= _mm_mul_pd(rsq10
,rinv10
);
882 /* Compute parameters for interactions between i and j atoms */
883 qq10
= _mm_mul_pd(iq1
,jq0
);
885 /* EWALD ELECTROSTATICS */
887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
888 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
889 ewitab
= _mm_cvttpd_epi32(ewrt
);
890 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
891 ewitab
= _mm_slli_epi32(ewitab
,2);
892 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
893 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
894 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
895 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
896 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
897 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
898 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
899 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
900 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
901 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
903 d
= _mm_sub_pd(r10
,rswitch
);
904 d
= _mm_max_pd(d
,_mm_setzero_pd());
905 d2
= _mm_mul_pd(d
,d
);
906 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
908 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
910 /* Evaluate switch function */
911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
912 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
913 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
917 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
919 /* Calculate temporary vectorial force */
920 tx
= _mm_mul_pd(fscal
,dx10
);
921 ty
= _mm_mul_pd(fscal
,dy10
);
922 tz
= _mm_mul_pd(fscal
,dz10
);
924 /* Update vectorial force */
925 fix1
= _mm_add_pd(fix1
,tx
);
926 fiy1
= _mm_add_pd(fiy1
,ty
);
927 fiz1
= _mm_add_pd(fiz1
,tz
);
929 fjx0
= _mm_add_pd(fjx0
,tx
);
930 fjy0
= _mm_add_pd(fjy0
,ty
);
931 fjz0
= _mm_add_pd(fjz0
,tz
);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
942 r20
= _mm_mul_pd(rsq20
,rinv20
);
944 /* Compute parameters for interactions between i and j atoms */
945 qq20
= _mm_mul_pd(iq2
,jq0
);
947 /* EWALD ELECTROSTATICS */
949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
950 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
951 ewitab
= _mm_cvttpd_epi32(ewrt
);
952 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
953 ewitab
= _mm_slli_epi32(ewitab
,2);
954 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
955 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
956 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
957 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
958 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
959 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
960 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
961 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
962 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
963 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
965 d
= _mm_sub_pd(r20
,rswitch
);
966 d
= _mm_max_pd(d
,_mm_setzero_pd());
967 d2
= _mm_mul_pd(d
,d
);
968 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
970 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
972 /* Evaluate switch function */
973 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
974 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
975 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
979 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
981 /* Calculate temporary vectorial force */
982 tx
= _mm_mul_pd(fscal
,dx20
);
983 ty
= _mm_mul_pd(fscal
,dy20
);
984 tz
= _mm_mul_pd(fscal
,dz20
);
986 /* Update vectorial force */
987 fix2
= _mm_add_pd(fix2
,tx
);
988 fiy2
= _mm_add_pd(fiy2
,ty
);
989 fiz2
= _mm_add_pd(fiz2
,tz
);
991 fjx0
= _mm_add_pd(fjx0
,tx
);
992 fjy0
= _mm_add_pd(fjy0
,ty
);
993 fjz0
= _mm_add_pd(fjz0
,tz
);
997 /**************************
998 * CALCULATE INTERACTIONS *
999 **************************/
1001 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1004 r30
= _mm_mul_pd(rsq30
,rinv30
);
1006 /* Compute parameters for interactions between i and j atoms */
1007 qq30
= _mm_mul_pd(iq3
,jq0
);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1013 ewitab
= _mm_cvttpd_epi32(ewrt
);
1014 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1015 ewitab
= _mm_slli_epi32(ewitab
,2);
1016 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1017 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1018 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1019 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1020 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1021 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1022 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1023 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1024 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
1025 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1027 d
= _mm_sub_pd(r30
,rswitch
);
1028 d
= _mm_max_pd(d
,_mm_setzero_pd());
1029 d2
= _mm_mul_pd(d
,d
);
1030 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1032 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1034 /* Evaluate switch function */
1035 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1036 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
1037 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1041 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1043 /* Calculate temporary vectorial force */
1044 tx
= _mm_mul_pd(fscal
,dx30
);
1045 ty
= _mm_mul_pd(fscal
,dy30
);
1046 tz
= _mm_mul_pd(fscal
,dz30
);
1048 /* Update vectorial force */
1049 fix3
= _mm_add_pd(fix3
,tx
);
1050 fiy3
= _mm_add_pd(fiy3
,ty
);
1051 fiz3
= _mm_add_pd(fiz3
,tz
);
1053 fjx0
= _mm_add_pd(fjx0
,tx
);
1054 fjy0
= _mm_add_pd(fjy0
,ty
);
1055 fjz0
= _mm_add_pd(fjz0
,tz
);
1059 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1061 /* Inner loop uses 189 flops */
1064 if(jidx
<j_index_end
)
1068 j_coord_offsetA
= DIM
*jnrA
;
1070 /* load j atom coordinates */
1071 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1074 /* Calculate displacement vector */
1075 dx10
= _mm_sub_pd(ix1
,jx0
);
1076 dy10
= _mm_sub_pd(iy1
,jy0
);
1077 dz10
= _mm_sub_pd(iz1
,jz0
);
1078 dx20
= _mm_sub_pd(ix2
,jx0
);
1079 dy20
= _mm_sub_pd(iy2
,jy0
);
1080 dz20
= _mm_sub_pd(iz2
,jz0
);
1081 dx30
= _mm_sub_pd(ix3
,jx0
);
1082 dy30
= _mm_sub_pd(iy3
,jy0
);
1083 dz30
= _mm_sub_pd(iz3
,jz0
);
1085 /* Calculate squared distance and things based on it */
1086 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1087 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1088 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
1090 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1091 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1092 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
1094 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1095 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1096 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
1098 /* Load parameters for j particles */
1099 jq0
= _mm_load_sd(charge
+jnrA
+0);
1101 fjx0
= _mm_setzero_pd();
1102 fjy0
= _mm_setzero_pd();
1103 fjz0
= _mm_setzero_pd();
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1112 r10
= _mm_mul_pd(rsq10
,rinv10
);
1114 /* Compute parameters for interactions between i and j atoms */
1115 qq10
= _mm_mul_pd(iq1
,jq0
);
1117 /* EWALD ELECTROSTATICS */
1119 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1120 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1121 ewitab
= _mm_cvttpd_epi32(ewrt
);
1122 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1123 ewitab
= _mm_slli_epi32(ewitab
,2);
1124 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1125 ewtabD
= _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1127 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1128 ewtabFn
= _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1130 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1131 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1132 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1133 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1135 d
= _mm_sub_pd(r10
,rswitch
);
1136 d
= _mm_max_pd(d
,_mm_setzero_pd());
1137 d2
= _mm_mul_pd(d
,d
);
1138 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1140 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1142 /* Evaluate switch function */
1143 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1144 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
1145 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1149 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1151 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1153 /* Calculate temporary vectorial force */
1154 tx
= _mm_mul_pd(fscal
,dx10
);
1155 ty
= _mm_mul_pd(fscal
,dy10
);
1156 tz
= _mm_mul_pd(fscal
,dz10
);
1158 /* Update vectorial force */
1159 fix1
= _mm_add_pd(fix1
,tx
);
1160 fiy1
= _mm_add_pd(fiy1
,ty
);
1161 fiz1
= _mm_add_pd(fiz1
,tz
);
1163 fjx0
= _mm_add_pd(fjx0
,tx
);
1164 fjy0
= _mm_add_pd(fjy0
,ty
);
1165 fjz0
= _mm_add_pd(fjz0
,tz
);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1176 r20
= _mm_mul_pd(rsq20
,rinv20
);
1178 /* Compute parameters for interactions between i and j atoms */
1179 qq20
= _mm_mul_pd(iq2
,jq0
);
1181 /* EWALD ELECTROSTATICS */
1183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1185 ewitab
= _mm_cvttpd_epi32(ewrt
);
1186 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1187 ewitab
= _mm_slli_epi32(ewitab
,2);
1188 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1189 ewtabD
= _mm_setzero_pd();
1190 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1191 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1192 ewtabFn
= _mm_setzero_pd();
1193 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1194 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1195 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1196 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1197 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1199 d
= _mm_sub_pd(r20
,rswitch
);
1200 d
= _mm_max_pd(d
,_mm_setzero_pd());
1201 d2
= _mm_mul_pd(d
,d
);
1202 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1204 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1209 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1213 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1215 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1217 /* Calculate temporary vectorial force */
1218 tx
= _mm_mul_pd(fscal
,dx20
);
1219 ty
= _mm_mul_pd(fscal
,dy20
);
1220 tz
= _mm_mul_pd(fscal
,dz20
);
1222 /* Update vectorial force */
1223 fix2
= _mm_add_pd(fix2
,tx
);
1224 fiy2
= _mm_add_pd(fiy2
,ty
);
1225 fiz2
= _mm_add_pd(fiz2
,tz
);
1227 fjx0
= _mm_add_pd(fjx0
,tx
);
1228 fjy0
= _mm_add_pd(fjy0
,ty
);
1229 fjz0
= _mm_add_pd(fjz0
,tz
);
1233 /**************************
1234 * CALCULATE INTERACTIONS *
1235 **************************/
1237 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
1240 r30
= _mm_mul_pd(rsq30
,rinv30
);
1242 /* Compute parameters for interactions between i and j atoms */
1243 qq30
= _mm_mul_pd(iq3
,jq0
);
1245 /* EWALD ELECTROSTATICS */
1247 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1248 ewrt
= _mm_mul_pd(r30
,ewtabscale
);
1249 ewitab
= _mm_cvttpd_epi32(ewrt
);
1250 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1251 ewitab
= _mm_slli_epi32(ewitab
,2);
1252 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1253 ewtabD
= _mm_setzero_pd();
1254 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1255 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1256 ewtabFn
= _mm_setzero_pd();
1257 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1258 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1259 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1260 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(rinv30
,velec
));
1261 felec
= _mm_mul_pd(_mm_mul_pd(qq30
,rinv30
),_mm_sub_pd(rinvsq30
,felec
));
1263 d
= _mm_sub_pd(r30
,rswitch
);
1264 d
= _mm_max_pd(d
,_mm_setzero_pd());
1265 d2
= _mm_mul_pd(d
,d
);
1266 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1268 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1270 /* Evaluate switch function */
1271 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1272 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv30
,_mm_mul_pd(velec
,dsw
)) );
1273 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
1277 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1279 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1281 /* Calculate temporary vectorial force */
1282 tx
= _mm_mul_pd(fscal
,dx30
);
1283 ty
= _mm_mul_pd(fscal
,dy30
);
1284 tz
= _mm_mul_pd(fscal
,dz30
);
1286 /* Update vectorial force */
1287 fix3
= _mm_add_pd(fix3
,tx
);
1288 fiy3
= _mm_add_pd(fiy3
,ty
);
1289 fiz3
= _mm_add_pd(fiz3
,tz
);
1291 fjx0
= _mm_add_pd(fjx0
,tx
);
1292 fjy0
= _mm_add_pd(fjy0
,ty
);
1293 fjz0
= _mm_add_pd(fjz0
,tz
);
1297 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1299 /* Inner loop uses 189 flops */
1302 /* End of innermost loop */
1304 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1305 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1307 /* Increment number of inner iterations */
1308 inneriter
+= j_index_end
- j_index_start
;
1310 /* Outer loop uses 18 flops */
1313 /* Increment number of outer iterations */
1316 /* Update outer/inner flops */
1318 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_F
,outeriter
*18 + inneriter
*189);