2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse4_1_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
88 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
89 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
90 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
93 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
95 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
96 real rswitch_scalar
,d_scalar
;
97 __m128d dummy_mask
,cutoff_mask
;
98 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
99 __m128d one
= _mm_set1_pd(1.0);
100 __m128d two
= _mm_set1_pd(2.0);
106 jindex
= nlist
->jindex
;
108 shiftidx
= nlist
->shift
;
110 shiftvec
= fr
->shift_vec
[0];
111 fshift
= fr
->fshift
[0];
112 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
113 charge
= mdatoms
->chargeA
;
115 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
116 ewtab
= fr
->ic
->tabq_coul_FDV0
;
117 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
118 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
120 /* Setup water-specific parameters */
121 inr
= nlist
->iinr
[0];
122 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
123 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
124 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
126 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
127 rcutoff_scalar
= fr
->ic
->rcoulomb
;
128 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
129 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
131 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
132 rswitch
= _mm_set1_pd(rswitch_scalar
);
133 /* Setup switch parameters */
134 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
135 d
= _mm_set1_pd(d_scalar
);
136 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
137 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
138 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
139 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
140 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
141 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
143 /* Avoid stupid compiler warnings */
151 /* Start outer loop over neighborlists */
152 for(iidx
=0; iidx
<nri
; iidx
++)
154 /* Load shift vector for this list */
155 i_shift_offset
= DIM
*shiftidx
[iidx
];
157 /* Load limits for loop over neighbors */
158 j_index_start
= jindex
[iidx
];
159 j_index_end
= jindex
[iidx
+1];
161 /* Get outer coordinate index */
163 i_coord_offset
= DIM
*inr
;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
167 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
169 fix0
= _mm_setzero_pd();
170 fiy0
= _mm_setzero_pd();
171 fiz0
= _mm_setzero_pd();
172 fix1
= _mm_setzero_pd();
173 fiy1
= _mm_setzero_pd();
174 fiz1
= _mm_setzero_pd();
175 fix2
= _mm_setzero_pd();
176 fiy2
= _mm_setzero_pd();
177 fiz2
= _mm_setzero_pd();
179 /* Reset potential sums */
180 velecsum
= _mm_setzero_pd();
182 /* Start inner kernel loop */
183 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
186 /* Get j neighbor index, and coordinate index */
189 j_coord_offsetA
= DIM
*jnrA
;
190 j_coord_offsetB
= DIM
*jnrB
;
192 /* load j atom coordinates */
193 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
196 /* Calculate displacement vector */
197 dx00
= _mm_sub_pd(ix0
,jx0
);
198 dy00
= _mm_sub_pd(iy0
,jy0
);
199 dz00
= _mm_sub_pd(iz0
,jz0
);
200 dx10
= _mm_sub_pd(ix1
,jx0
);
201 dy10
= _mm_sub_pd(iy1
,jy0
);
202 dz10
= _mm_sub_pd(iz1
,jz0
);
203 dx20
= _mm_sub_pd(ix2
,jx0
);
204 dy20
= _mm_sub_pd(iy2
,jy0
);
205 dz20
= _mm_sub_pd(iz2
,jz0
);
207 /* Calculate squared distance and things based on it */
208 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
209 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
210 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
212 rinv00
= sse41_invsqrt_d(rsq00
);
213 rinv10
= sse41_invsqrt_d(rsq10
);
214 rinv20
= sse41_invsqrt_d(rsq20
);
216 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
217 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
218 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
220 /* Load parameters for j particles */
221 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
223 fjx0
= _mm_setzero_pd();
224 fjy0
= _mm_setzero_pd();
225 fjz0
= _mm_setzero_pd();
227 /**************************
228 * CALCULATE INTERACTIONS *
229 **************************/
231 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
234 r00
= _mm_mul_pd(rsq00
,rinv00
);
236 /* Compute parameters for interactions between i and j atoms */
237 qq00
= _mm_mul_pd(iq0
,jq0
);
239 /* EWALD ELECTROSTATICS */
241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
242 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
243 ewitab
= _mm_cvttpd_epi32(ewrt
);
244 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
245 ewitab
= _mm_slli_epi32(ewitab
,2);
246 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
247 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
248 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
249 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
250 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
251 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
252 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
253 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
254 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
255 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
257 d
= _mm_sub_pd(r00
,rswitch
);
258 d
= _mm_max_pd(d
,_mm_setzero_pd());
259 d2
= _mm_mul_pd(d
,d
);
260 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
262 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
264 /* Evaluate switch function */
265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
266 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
267 velec
= _mm_mul_pd(velec
,sw
);
268 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
270 /* Update potential sum for this i atom from the interaction with this j atom. */
271 velec
= _mm_and_pd(velec
,cutoff_mask
);
272 velecsum
= _mm_add_pd(velecsum
,velec
);
276 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
278 /* Calculate temporary vectorial force */
279 tx
= _mm_mul_pd(fscal
,dx00
);
280 ty
= _mm_mul_pd(fscal
,dy00
);
281 tz
= _mm_mul_pd(fscal
,dz00
);
283 /* Update vectorial force */
284 fix0
= _mm_add_pd(fix0
,tx
);
285 fiy0
= _mm_add_pd(fiy0
,ty
);
286 fiz0
= _mm_add_pd(fiz0
,tz
);
288 fjx0
= _mm_add_pd(fjx0
,tx
);
289 fjy0
= _mm_add_pd(fjy0
,ty
);
290 fjz0
= _mm_add_pd(fjz0
,tz
);
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
301 r10
= _mm_mul_pd(rsq10
,rinv10
);
303 /* Compute parameters for interactions between i and j atoms */
304 qq10
= _mm_mul_pd(iq1
,jq0
);
306 /* EWALD ELECTROSTATICS */
308 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
309 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
310 ewitab
= _mm_cvttpd_epi32(ewrt
);
311 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
312 ewitab
= _mm_slli_epi32(ewitab
,2);
313 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
314 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
315 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
316 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
317 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
318 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
319 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
320 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
321 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
322 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
324 d
= _mm_sub_pd(r10
,rswitch
);
325 d
= _mm_max_pd(d
,_mm_setzero_pd());
326 d2
= _mm_mul_pd(d
,d
);
327 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
329 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
331 /* Evaluate switch function */
332 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
333 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
334 velec
= _mm_mul_pd(velec
,sw
);
335 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 velec
= _mm_and_pd(velec
,cutoff_mask
);
339 velecsum
= _mm_add_pd(velecsum
,velec
);
343 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
345 /* Calculate temporary vectorial force */
346 tx
= _mm_mul_pd(fscal
,dx10
);
347 ty
= _mm_mul_pd(fscal
,dy10
);
348 tz
= _mm_mul_pd(fscal
,dz10
);
350 /* Update vectorial force */
351 fix1
= _mm_add_pd(fix1
,tx
);
352 fiy1
= _mm_add_pd(fiy1
,ty
);
353 fiz1
= _mm_add_pd(fiz1
,tz
);
355 fjx0
= _mm_add_pd(fjx0
,tx
);
356 fjy0
= _mm_add_pd(fjy0
,ty
);
357 fjz0
= _mm_add_pd(fjz0
,tz
);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
368 r20
= _mm_mul_pd(rsq20
,rinv20
);
370 /* Compute parameters for interactions between i and j atoms */
371 qq20
= _mm_mul_pd(iq2
,jq0
);
373 /* EWALD ELECTROSTATICS */
375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
376 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
377 ewitab
= _mm_cvttpd_epi32(ewrt
);
378 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
379 ewitab
= _mm_slli_epi32(ewitab
,2);
380 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
381 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
382 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
383 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
384 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
385 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
386 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
387 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
388 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
389 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
391 d
= _mm_sub_pd(r20
,rswitch
);
392 d
= _mm_max_pd(d
,_mm_setzero_pd());
393 d2
= _mm_mul_pd(d
,d
);
394 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
396 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
398 /* Evaluate switch function */
399 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
400 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
401 velec
= _mm_mul_pd(velec
,sw
);
402 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velec
= _mm_and_pd(velec
,cutoff_mask
);
406 velecsum
= _mm_add_pd(velecsum
,velec
);
410 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
412 /* Calculate temporary vectorial force */
413 tx
= _mm_mul_pd(fscal
,dx20
);
414 ty
= _mm_mul_pd(fscal
,dy20
);
415 tz
= _mm_mul_pd(fscal
,dz20
);
417 /* Update vectorial force */
418 fix2
= _mm_add_pd(fix2
,tx
);
419 fiy2
= _mm_add_pd(fiy2
,ty
);
420 fiz2
= _mm_add_pd(fiz2
,tz
);
422 fjx0
= _mm_add_pd(fjx0
,tx
);
423 fjy0
= _mm_add_pd(fjy0
,ty
);
424 fjz0
= _mm_add_pd(fjz0
,tz
);
428 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
430 /* Inner loop uses 198 flops */
437 j_coord_offsetA
= DIM
*jnrA
;
439 /* load j atom coordinates */
440 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
443 /* Calculate displacement vector */
444 dx00
= _mm_sub_pd(ix0
,jx0
);
445 dy00
= _mm_sub_pd(iy0
,jy0
);
446 dz00
= _mm_sub_pd(iz0
,jz0
);
447 dx10
= _mm_sub_pd(ix1
,jx0
);
448 dy10
= _mm_sub_pd(iy1
,jy0
);
449 dz10
= _mm_sub_pd(iz1
,jz0
);
450 dx20
= _mm_sub_pd(ix2
,jx0
);
451 dy20
= _mm_sub_pd(iy2
,jy0
);
452 dz20
= _mm_sub_pd(iz2
,jz0
);
454 /* Calculate squared distance and things based on it */
455 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
456 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
457 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
459 rinv00
= sse41_invsqrt_d(rsq00
);
460 rinv10
= sse41_invsqrt_d(rsq10
);
461 rinv20
= sse41_invsqrt_d(rsq20
);
463 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
464 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
465 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
467 /* Load parameters for j particles */
468 jq0
= _mm_load_sd(charge
+jnrA
+0);
470 fjx0
= _mm_setzero_pd();
471 fjy0
= _mm_setzero_pd();
472 fjz0
= _mm_setzero_pd();
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
481 r00
= _mm_mul_pd(rsq00
,rinv00
);
483 /* Compute parameters for interactions between i and j atoms */
484 qq00
= _mm_mul_pd(iq0
,jq0
);
486 /* EWALD ELECTROSTATICS */
488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
490 ewitab
= _mm_cvttpd_epi32(ewrt
);
491 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
492 ewitab
= _mm_slli_epi32(ewitab
,2);
493 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
494 ewtabD
= _mm_setzero_pd();
495 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
496 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
497 ewtabFn
= _mm_setzero_pd();
498 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
499 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
500 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
501 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
502 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
504 d
= _mm_sub_pd(r00
,rswitch
);
505 d
= _mm_max_pd(d
,_mm_setzero_pd());
506 d2
= _mm_mul_pd(d
,d
);
507 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
509 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
511 /* Evaluate switch function */
512 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
513 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
514 velec
= _mm_mul_pd(velec
,sw
);
515 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
517 /* Update potential sum for this i atom from the interaction with this j atom. */
518 velec
= _mm_and_pd(velec
,cutoff_mask
);
519 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
520 velecsum
= _mm_add_pd(velecsum
,velec
);
524 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
526 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
528 /* Calculate temporary vectorial force */
529 tx
= _mm_mul_pd(fscal
,dx00
);
530 ty
= _mm_mul_pd(fscal
,dy00
);
531 tz
= _mm_mul_pd(fscal
,dz00
);
533 /* Update vectorial force */
534 fix0
= _mm_add_pd(fix0
,tx
);
535 fiy0
= _mm_add_pd(fiy0
,ty
);
536 fiz0
= _mm_add_pd(fiz0
,tz
);
538 fjx0
= _mm_add_pd(fjx0
,tx
);
539 fjy0
= _mm_add_pd(fjy0
,ty
);
540 fjz0
= _mm_add_pd(fjz0
,tz
);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
551 r10
= _mm_mul_pd(rsq10
,rinv10
);
553 /* Compute parameters for interactions between i and j atoms */
554 qq10
= _mm_mul_pd(iq1
,jq0
);
556 /* EWALD ELECTROSTATICS */
558 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
559 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
560 ewitab
= _mm_cvttpd_epi32(ewrt
);
561 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
562 ewitab
= _mm_slli_epi32(ewitab
,2);
563 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
564 ewtabD
= _mm_setzero_pd();
565 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
566 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
567 ewtabFn
= _mm_setzero_pd();
568 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
569 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
570 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
571 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
572 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
574 d
= _mm_sub_pd(r10
,rswitch
);
575 d
= _mm_max_pd(d
,_mm_setzero_pd());
576 d2
= _mm_mul_pd(d
,d
);
577 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
579 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
581 /* Evaluate switch function */
582 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
583 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
584 velec
= _mm_mul_pd(velec
,sw
);
585 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velec
= _mm_and_pd(velec
,cutoff_mask
);
589 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
590 velecsum
= _mm_add_pd(velecsum
,velec
);
594 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
596 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
598 /* Calculate temporary vectorial force */
599 tx
= _mm_mul_pd(fscal
,dx10
);
600 ty
= _mm_mul_pd(fscal
,dy10
);
601 tz
= _mm_mul_pd(fscal
,dz10
);
603 /* Update vectorial force */
604 fix1
= _mm_add_pd(fix1
,tx
);
605 fiy1
= _mm_add_pd(fiy1
,ty
);
606 fiz1
= _mm_add_pd(fiz1
,tz
);
608 fjx0
= _mm_add_pd(fjx0
,tx
);
609 fjy0
= _mm_add_pd(fjy0
,ty
);
610 fjz0
= _mm_add_pd(fjz0
,tz
);
614 /**************************
615 * CALCULATE INTERACTIONS *
616 **************************/
618 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
621 r20
= _mm_mul_pd(rsq20
,rinv20
);
623 /* Compute parameters for interactions between i and j atoms */
624 qq20
= _mm_mul_pd(iq2
,jq0
);
626 /* EWALD ELECTROSTATICS */
628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
630 ewitab
= _mm_cvttpd_epi32(ewrt
);
631 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
632 ewitab
= _mm_slli_epi32(ewitab
,2);
633 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
634 ewtabD
= _mm_setzero_pd();
635 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
636 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
637 ewtabFn
= _mm_setzero_pd();
638 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
639 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
640 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
641 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
642 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
644 d
= _mm_sub_pd(r20
,rswitch
);
645 d
= _mm_max_pd(d
,_mm_setzero_pd());
646 d2
= _mm_mul_pd(d
,d
);
647 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
649 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
651 /* Evaluate switch function */
652 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
653 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
654 velec
= _mm_mul_pd(velec
,sw
);
655 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velec
= _mm_and_pd(velec
,cutoff_mask
);
659 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
660 velecsum
= _mm_add_pd(velecsum
,velec
);
664 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
666 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
668 /* Calculate temporary vectorial force */
669 tx
= _mm_mul_pd(fscal
,dx20
);
670 ty
= _mm_mul_pd(fscal
,dy20
);
671 tz
= _mm_mul_pd(fscal
,dz20
);
673 /* Update vectorial force */
674 fix2
= _mm_add_pd(fix2
,tx
);
675 fiy2
= _mm_add_pd(fiy2
,ty
);
676 fiz2
= _mm_add_pd(fiz2
,tz
);
678 fjx0
= _mm_add_pd(fjx0
,tx
);
679 fjy0
= _mm_add_pd(fjy0
,ty
);
680 fjz0
= _mm_add_pd(fjz0
,tz
);
684 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
686 /* Inner loop uses 198 flops */
689 /* End of innermost loop */
691 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
692 f
+i_coord_offset
,fshift
+i_shift_offset
);
695 /* Update potential energies */
696 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
698 /* Increment number of inner iterations */
699 inneriter
+= j_index_end
- j_index_start
;
701 /* Outer loop uses 19 flops */
704 /* Increment number of outer iterations */
707 /* Update outer/inner flops */
709 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*198);
712 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_double
713 * Electrostatics interaction: Ewald
714 * VdW interaction: None
715 * Geometry: Water3-Particle
716 * Calculate force/pot: Force
719 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_double
720 (t_nblist
* gmx_restrict nlist
,
721 rvec
* gmx_restrict xx
,
722 rvec
* gmx_restrict ff
,
723 struct t_forcerec
* gmx_restrict fr
,
724 t_mdatoms
* gmx_restrict mdatoms
,
725 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
726 t_nrnb
* gmx_restrict nrnb
)
728 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
729 * just 0 for non-waters.
730 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
731 * jnr indices corresponding to data put in the four positions in the SIMD register.
733 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
734 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
736 int j_coord_offsetA
,j_coord_offsetB
;
737 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
739 real
*shiftvec
,*fshift
,*x
,*f
;
740 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
742 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
744 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
746 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
747 int vdwjidx0A
,vdwjidx0B
;
748 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
749 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
750 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
751 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
752 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
755 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
757 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
758 real rswitch_scalar
,d_scalar
;
759 __m128d dummy_mask
,cutoff_mask
;
760 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
761 __m128d one
= _mm_set1_pd(1.0);
762 __m128d two
= _mm_set1_pd(2.0);
768 jindex
= nlist
->jindex
;
770 shiftidx
= nlist
->shift
;
772 shiftvec
= fr
->shift_vec
[0];
773 fshift
= fr
->fshift
[0];
774 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
775 charge
= mdatoms
->chargeA
;
777 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
778 ewtab
= fr
->ic
->tabq_coul_FDV0
;
779 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
780 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
782 /* Setup water-specific parameters */
783 inr
= nlist
->iinr
[0];
784 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
785 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
786 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
788 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
789 rcutoff_scalar
= fr
->ic
->rcoulomb
;
790 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
791 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
793 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
794 rswitch
= _mm_set1_pd(rswitch_scalar
);
795 /* Setup switch parameters */
796 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
797 d
= _mm_set1_pd(d_scalar
);
798 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
799 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
800 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
801 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
802 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
803 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
805 /* Avoid stupid compiler warnings */
813 /* Start outer loop over neighborlists */
814 for(iidx
=0; iidx
<nri
; iidx
++)
816 /* Load shift vector for this list */
817 i_shift_offset
= DIM
*shiftidx
[iidx
];
819 /* Load limits for loop over neighbors */
820 j_index_start
= jindex
[iidx
];
821 j_index_end
= jindex
[iidx
+1];
823 /* Get outer coordinate index */
825 i_coord_offset
= DIM
*inr
;
827 /* Load i particle coords and add shift vector */
828 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
829 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
831 fix0
= _mm_setzero_pd();
832 fiy0
= _mm_setzero_pd();
833 fiz0
= _mm_setzero_pd();
834 fix1
= _mm_setzero_pd();
835 fiy1
= _mm_setzero_pd();
836 fiz1
= _mm_setzero_pd();
837 fix2
= _mm_setzero_pd();
838 fiy2
= _mm_setzero_pd();
839 fiz2
= _mm_setzero_pd();
841 /* Start inner kernel loop */
842 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
845 /* Get j neighbor index, and coordinate index */
848 j_coord_offsetA
= DIM
*jnrA
;
849 j_coord_offsetB
= DIM
*jnrB
;
851 /* load j atom coordinates */
852 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
855 /* Calculate displacement vector */
856 dx00
= _mm_sub_pd(ix0
,jx0
);
857 dy00
= _mm_sub_pd(iy0
,jy0
);
858 dz00
= _mm_sub_pd(iz0
,jz0
);
859 dx10
= _mm_sub_pd(ix1
,jx0
);
860 dy10
= _mm_sub_pd(iy1
,jy0
);
861 dz10
= _mm_sub_pd(iz1
,jz0
);
862 dx20
= _mm_sub_pd(ix2
,jx0
);
863 dy20
= _mm_sub_pd(iy2
,jy0
);
864 dz20
= _mm_sub_pd(iz2
,jz0
);
866 /* Calculate squared distance and things based on it */
867 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
868 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
869 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
871 rinv00
= sse41_invsqrt_d(rsq00
);
872 rinv10
= sse41_invsqrt_d(rsq10
);
873 rinv20
= sse41_invsqrt_d(rsq20
);
875 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
876 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
877 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
879 /* Load parameters for j particles */
880 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
882 fjx0
= _mm_setzero_pd();
883 fjy0
= _mm_setzero_pd();
884 fjz0
= _mm_setzero_pd();
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
893 r00
= _mm_mul_pd(rsq00
,rinv00
);
895 /* Compute parameters for interactions between i and j atoms */
896 qq00
= _mm_mul_pd(iq0
,jq0
);
898 /* EWALD ELECTROSTATICS */
900 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
901 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
902 ewitab
= _mm_cvttpd_epi32(ewrt
);
903 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
904 ewitab
= _mm_slli_epi32(ewitab
,2);
905 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
906 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
907 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
908 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
909 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
910 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
911 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
912 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
913 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
914 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
916 d
= _mm_sub_pd(r00
,rswitch
);
917 d
= _mm_max_pd(d
,_mm_setzero_pd());
918 d2
= _mm_mul_pd(d
,d
);
919 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
921 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
923 /* Evaluate switch function */
924 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
925 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
926 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
930 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
932 /* Calculate temporary vectorial force */
933 tx
= _mm_mul_pd(fscal
,dx00
);
934 ty
= _mm_mul_pd(fscal
,dy00
);
935 tz
= _mm_mul_pd(fscal
,dz00
);
937 /* Update vectorial force */
938 fix0
= _mm_add_pd(fix0
,tx
);
939 fiy0
= _mm_add_pd(fiy0
,ty
);
940 fiz0
= _mm_add_pd(fiz0
,tz
);
942 fjx0
= _mm_add_pd(fjx0
,tx
);
943 fjy0
= _mm_add_pd(fjy0
,ty
);
944 fjz0
= _mm_add_pd(fjz0
,tz
);
948 /**************************
949 * CALCULATE INTERACTIONS *
950 **************************/
952 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
955 r10
= _mm_mul_pd(rsq10
,rinv10
);
957 /* Compute parameters for interactions between i and j atoms */
958 qq10
= _mm_mul_pd(iq1
,jq0
);
960 /* EWALD ELECTROSTATICS */
962 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
963 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
964 ewitab
= _mm_cvttpd_epi32(ewrt
);
965 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
966 ewitab
= _mm_slli_epi32(ewitab
,2);
967 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
968 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
969 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
970 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
971 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
972 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
973 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
974 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
975 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
976 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
978 d
= _mm_sub_pd(r10
,rswitch
);
979 d
= _mm_max_pd(d
,_mm_setzero_pd());
980 d2
= _mm_mul_pd(d
,d
);
981 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
983 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
985 /* Evaluate switch function */
986 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
987 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
988 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
992 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
994 /* Calculate temporary vectorial force */
995 tx
= _mm_mul_pd(fscal
,dx10
);
996 ty
= _mm_mul_pd(fscal
,dy10
);
997 tz
= _mm_mul_pd(fscal
,dz10
);
999 /* Update vectorial force */
1000 fix1
= _mm_add_pd(fix1
,tx
);
1001 fiy1
= _mm_add_pd(fiy1
,ty
);
1002 fiz1
= _mm_add_pd(fiz1
,tz
);
1004 fjx0
= _mm_add_pd(fjx0
,tx
);
1005 fjy0
= _mm_add_pd(fjy0
,ty
);
1006 fjz0
= _mm_add_pd(fjz0
,tz
);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1017 r20
= _mm_mul_pd(rsq20
,rinv20
);
1019 /* Compute parameters for interactions between i and j atoms */
1020 qq20
= _mm_mul_pd(iq2
,jq0
);
1022 /* EWALD ELECTROSTATICS */
1024 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1025 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1026 ewitab
= _mm_cvttpd_epi32(ewrt
);
1027 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1028 ewitab
= _mm_slli_epi32(ewitab
,2);
1029 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1030 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1031 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1032 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1033 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1034 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1035 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1036 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1037 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1038 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1040 d
= _mm_sub_pd(r20
,rswitch
);
1041 d
= _mm_max_pd(d
,_mm_setzero_pd());
1042 d2
= _mm_mul_pd(d
,d
);
1043 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1045 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1047 /* Evaluate switch function */
1048 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1049 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1050 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1054 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1056 /* Calculate temporary vectorial force */
1057 tx
= _mm_mul_pd(fscal
,dx20
);
1058 ty
= _mm_mul_pd(fscal
,dy20
);
1059 tz
= _mm_mul_pd(fscal
,dz20
);
1061 /* Update vectorial force */
1062 fix2
= _mm_add_pd(fix2
,tx
);
1063 fiy2
= _mm_add_pd(fiy2
,ty
);
1064 fiz2
= _mm_add_pd(fiz2
,tz
);
1066 fjx0
= _mm_add_pd(fjx0
,tx
);
1067 fjy0
= _mm_add_pd(fjy0
,ty
);
1068 fjz0
= _mm_add_pd(fjz0
,tz
);
1072 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
1074 /* Inner loop uses 189 flops */
1077 if(jidx
<j_index_end
)
1081 j_coord_offsetA
= DIM
*jnrA
;
1083 /* load j atom coordinates */
1084 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1087 /* Calculate displacement vector */
1088 dx00
= _mm_sub_pd(ix0
,jx0
);
1089 dy00
= _mm_sub_pd(iy0
,jy0
);
1090 dz00
= _mm_sub_pd(iz0
,jz0
);
1091 dx10
= _mm_sub_pd(ix1
,jx0
);
1092 dy10
= _mm_sub_pd(iy1
,jy0
);
1093 dz10
= _mm_sub_pd(iz1
,jz0
);
1094 dx20
= _mm_sub_pd(ix2
,jx0
);
1095 dy20
= _mm_sub_pd(iy2
,jy0
);
1096 dz20
= _mm_sub_pd(iz2
,jz0
);
1098 /* Calculate squared distance and things based on it */
1099 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1100 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1101 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1103 rinv00
= sse41_invsqrt_d(rsq00
);
1104 rinv10
= sse41_invsqrt_d(rsq10
);
1105 rinv20
= sse41_invsqrt_d(rsq20
);
1107 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1108 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1109 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1111 /* Load parameters for j particles */
1112 jq0
= _mm_load_sd(charge
+jnrA
+0);
1114 fjx0
= _mm_setzero_pd();
1115 fjy0
= _mm_setzero_pd();
1116 fjz0
= _mm_setzero_pd();
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1125 r00
= _mm_mul_pd(rsq00
,rinv00
);
1127 /* Compute parameters for interactions between i and j atoms */
1128 qq00
= _mm_mul_pd(iq0
,jq0
);
1130 /* EWALD ELECTROSTATICS */
1132 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1133 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1134 ewitab
= _mm_cvttpd_epi32(ewrt
);
1135 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1136 ewitab
= _mm_slli_epi32(ewitab
,2);
1137 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1138 ewtabD
= _mm_setzero_pd();
1139 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1140 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1141 ewtabFn
= _mm_setzero_pd();
1142 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1143 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1144 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1145 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
1146 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1148 d
= _mm_sub_pd(r00
,rswitch
);
1149 d
= _mm_max_pd(d
,_mm_setzero_pd());
1150 d2
= _mm_mul_pd(d
,d
);
1151 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1153 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1155 /* Evaluate switch function */
1156 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1157 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
1158 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1162 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1164 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1166 /* Calculate temporary vectorial force */
1167 tx
= _mm_mul_pd(fscal
,dx00
);
1168 ty
= _mm_mul_pd(fscal
,dy00
);
1169 tz
= _mm_mul_pd(fscal
,dz00
);
1171 /* Update vectorial force */
1172 fix0
= _mm_add_pd(fix0
,tx
);
1173 fiy0
= _mm_add_pd(fiy0
,ty
);
1174 fiz0
= _mm_add_pd(fiz0
,tz
);
1176 fjx0
= _mm_add_pd(fjx0
,tx
);
1177 fjy0
= _mm_add_pd(fjy0
,ty
);
1178 fjz0
= _mm_add_pd(fjz0
,tz
);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1189 r10
= _mm_mul_pd(rsq10
,rinv10
);
1191 /* Compute parameters for interactions between i and j atoms */
1192 qq10
= _mm_mul_pd(iq1
,jq0
);
1194 /* EWALD ELECTROSTATICS */
1196 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1197 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1198 ewitab
= _mm_cvttpd_epi32(ewrt
);
1199 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1200 ewitab
= _mm_slli_epi32(ewitab
,2);
1201 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1202 ewtabD
= _mm_setzero_pd();
1203 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1204 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1205 ewtabFn
= _mm_setzero_pd();
1206 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1207 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1208 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1209 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1210 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1212 d
= _mm_sub_pd(r10
,rswitch
);
1213 d
= _mm_max_pd(d
,_mm_setzero_pd());
1214 d2
= _mm_mul_pd(d
,d
);
1215 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1217 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1219 /* Evaluate switch function */
1220 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1221 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
1222 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1226 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1228 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1230 /* Calculate temporary vectorial force */
1231 tx
= _mm_mul_pd(fscal
,dx10
);
1232 ty
= _mm_mul_pd(fscal
,dy10
);
1233 tz
= _mm_mul_pd(fscal
,dz10
);
1235 /* Update vectorial force */
1236 fix1
= _mm_add_pd(fix1
,tx
);
1237 fiy1
= _mm_add_pd(fiy1
,ty
);
1238 fiz1
= _mm_add_pd(fiz1
,tz
);
1240 fjx0
= _mm_add_pd(fjx0
,tx
);
1241 fjy0
= _mm_add_pd(fjy0
,ty
);
1242 fjz0
= _mm_add_pd(fjz0
,tz
);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1253 r20
= _mm_mul_pd(rsq20
,rinv20
);
1255 /* Compute parameters for interactions between i and j atoms */
1256 qq20
= _mm_mul_pd(iq2
,jq0
);
1258 /* EWALD ELECTROSTATICS */
1260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1261 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1262 ewitab
= _mm_cvttpd_epi32(ewrt
);
1263 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1264 ewitab
= _mm_slli_epi32(ewitab
,2);
1265 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1266 ewtabD
= _mm_setzero_pd();
1267 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1268 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1269 ewtabFn
= _mm_setzero_pd();
1270 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1271 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1272 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1273 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1274 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1276 d
= _mm_sub_pd(r20
,rswitch
);
1277 d
= _mm_max_pd(d
,_mm_setzero_pd());
1278 d2
= _mm_mul_pd(d
,d
);
1279 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1281 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1283 /* Evaluate switch function */
1284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1285 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1286 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1290 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1292 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1294 /* Calculate temporary vectorial force */
1295 tx
= _mm_mul_pd(fscal
,dx20
);
1296 ty
= _mm_mul_pd(fscal
,dy20
);
1297 tz
= _mm_mul_pd(fscal
,dz20
);
1299 /* Update vectorial force */
1300 fix2
= _mm_add_pd(fix2
,tx
);
1301 fiy2
= _mm_add_pd(fiy2
,ty
);
1302 fiz2
= _mm_add_pd(fiz2
,tz
);
1304 fjx0
= _mm_add_pd(fjx0
,tx
);
1305 fjy0
= _mm_add_pd(fjy0
,ty
);
1306 fjz0
= _mm_add_pd(fjz0
,tz
);
1310 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1312 /* Inner loop uses 189 flops */
1315 /* End of innermost loop */
1317 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1318 f
+i_coord_offset
,fshift
+i_shift_offset
);
1320 /* Increment number of inner iterations */
1321 inneriter
+= j_index_end
- j_index_start
;
1323 /* Outer loop uses 18 flops */
1326 /* Increment number of outer iterations */
1329 /* Update outer/inner flops */
1331 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*189);