/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_avx_128_fma_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
88 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
89 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
90 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
93 __m128d ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
95 __m128d dummy_mask
,cutoff_mask
;
96 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
97 __m128d one
= _mm_set1_pd(1.0);
98 __m128d two
= _mm_set1_pd(2.0);
104 jindex
= nlist
->jindex
;
106 shiftidx
= nlist
->shift
;
108 shiftvec
= fr
->shift_vec
[0];
109 fshift
= fr
->fshift
[0];
110 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
111 charge
= mdatoms
->chargeA
;
113 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
114 ewtab
= fr
->ic
->tabq_coul_FDV0
;
115 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
116 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
118 /* Setup water-specific parameters */
119 inr
= nlist
->iinr
[0];
120 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
121 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
122 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
124 /* Avoid stupid compiler warnings */
132 /* Start outer loop over neighborlists */
133 for(iidx
=0; iidx
<nri
; iidx
++)
135 /* Load shift vector for this list */
136 i_shift_offset
= DIM
*shiftidx
[iidx
];
138 /* Load limits for loop over neighbors */
139 j_index_start
= jindex
[iidx
];
140 j_index_end
= jindex
[iidx
+1];
142 /* Get outer coordinate index */
144 i_coord_offset
= DIM
*inr
;
146 /* Load i particle coords and add shift vector */
147 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
148 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
150 fix0
= _mm_setzero_pd();
151 fiy0
= _mm_setzero_pd();
152 fiz0
= _mm_setzero_pd();
153 fix1
= _mm_setzero_pd();
154 fiy1
= _mm_setzero_pd();
155 fiz1
= _mm_setzero_pd();
156 fix2
= _mm_setzero_pd();
157 fiy2
= _mm_setzero_pd();
158 fiz2
= _mm_setzero_pd();
160 /* Reset potential sums */
161 velecsum
= _mm_setzero_pd();
163 /* Start inner kernel loop */
164 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
167 /* Get j neighbor index, and coordinate index */
170 j_coord_offsetA
= DIM
*jnrA
;
171 j_coord_offsetB
= DIM
*jnrB
;
173 /* load j atom coordinates */
174 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
177 /* Calculate displacement vector */
178 dx00
= _mm_sub_pd(ix0
,jx0
);
179 dy00
= _mm_sub_pd(iy0
,jy0
);
180 dz00
= _mm_sub_pd(iz0
,jz0
);
181 dx10
= _mm_sub_pd(ix1
,jx0
);
182 dy10
= _mm_sub_pd(iy1
,jy0
);
183 dz10
= _mm_sub_pd(iz1
,jz0
);
184 dx20
= _mm_sub_pd(ix2
,jx0
);
185 dy20
= _mm_sub_pd(iy2
,jy0
);
186 dz20
= _mm_sub_pd(iz2
,jz0
);
188 /* Calculate squared distance and things based on it */
189 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
190 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
191 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
193 rinv00
= avx128fma_invsqrt_d(rsq00
);
194 rinv10
= avx128fma_invsqrt_d(rsq10
);
195 rinv20
= avx128fma_invsqrt_d(rsq20
);
197 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
198 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
199 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
201 /* Load parameters for j particles */
202 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
204 fjx0
= _mm_setzero_pd();
205 fjy0
= _mm_setzero_pd();
206 fjz0
= _mm_setzero_pd();
208 /**************************
209 * CALCULATE INTERACTIONS *
210 **************************/
212 r00
= _mm_mul_pd(rsq00
,rinv00
);
214 /* Compute parameters for interactions between i and j atoms */
215 qq00
= _mm_mul_pd(iq0
,jq0
);
217 /* EWALD ELECTROSTATICS */
219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
220 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
221 ewitab
= _mm_cvttpd_epi32(ewrt
);
223 eweps
= _mm_frcz_pd(ewrt
);
225 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
227 twoeweps
= _mm_add_pd(eweps
,eweps
);
228 ewitab
= _mm_slli_epi32(ewitab
,2);
229 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
230 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
231 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
232 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
233 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
234 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
235 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
236 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
237 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
238 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
240 /* Update potential sum for this i atom from the interaction with this j atom. */
241 velecsum
= _mm_add_pd(velecsum
,velec
);
245 /* Update vectorial force */
246 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
247 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
248 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
250 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
251 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
252 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 r10
= _mm_mul_pd(rsq10
,rinv10
);
260 /* Compute parameters for interactions between i and j atoms */
261 qq10
= _mm_mul_pd(iq1
,jq0
);
263 /* EWALD ELECTROSTATICS */
265 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
266 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
267 ewitab
= _mm_cvttpd_epi32(ewrt
);
269 eweps
= _mm_frcz_pd(ewrt
);
271 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
273 twoeweps
= _mm_add_pd(eweps
,eweps
);
274 ewitab
= _mm_slli_epi32(ewitab
,2);
275 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
276 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
277 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
278 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
279 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
280 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
281 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
282 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
283 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
284 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
286 /* Update potential sum for this i atom from the interaction with this j atom. */
287 velecsum
= _mm_add_pd(velecsum
,velec
);
291 /* Update vectorial force */
292 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
293 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
294 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
296 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
297 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
298 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 r20
= _mm_mul_pd(rsq20
,rinv20
);
306 /* Compute parameters for interactions between i and j atoms */
307 qq20
= _mm_mul_pd(iq2
,jq0
);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
313 ewitab
= _mm_cvttpd_epi32(ewrt
);
315 eweps
= _mm_frcz_pd(ewrt
);
317 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
319 twoeweps
= _mm_add_pd(eweps
,eweps
);
320 ewitab
= _mm_slli_epi32(ewitab
,2);
321 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
322 ewtabD
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
323 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
324 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
325 ewtabFn
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,1) +2);
326 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
327 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
328 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
329 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
330 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velecsum
= _mm_add_pd(velecsum
,velec
);
337 /* Update vectorial force */
338 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
339 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
340 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
342 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
343 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
344 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
346 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
348 /* Inner loop uses 135 flops */
355 j_coord_offsetA
= DIM
*jnrA
;
357 /* load j atom coordinates */
358 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
361 /* Calculate displacement vector */
362 dx00
= _mm_sub_pd(ix0
,jx0
);
363 dy00
= _mm_sub_pd(iy0
,jy0
);
364 dz00
= _mm_sub_pd(iz0
,jz0
);
365 dx10
= _mm_sub_pd(ix1
,jx0
);
366 dy10
= _mm_sub_pd(iy1
,jy0
);
367 dz10
= _mm_sub_pd(iz1
,jz0
);
368 dx20
= _mm_sub_pd(ix2
,jx0
);
369 dy20
= _mm_sub_pd(iy2
,jy0
);
370 dz20
= _mm_sub_pd(iz2
,jz0
);
372 /* Calculate squared distance and things based on it */
373 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
374 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
375 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
377 rinv00
= avx128fma_invsqrt_d(rsq00
);
378 rinv10
= avx128fma_invsqrt_d(rsq10
);
379 rinv20
= avx128fma_invsqrt_d(rsq20
);
381 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
382 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
383 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
385 /* Load parameters for j particles */
386 jq0
= _mm_load_sd(charge
+jnrA
+0);
388 fjx0
= _mm_setzero_pd();
389 fjy0
= _mm_setzero_pd();
390 fjz0
= _mm_setzero_pd();
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 r00
= _mm_mul_pd(rsq00
,rinv00
);
398 /* Compute parameters for interactions between i and j atoms */
399 qq00
= _mm_mul_pd(iq0
,jq0
);
401 /* EWALD ELECTROSTATICS */
403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
404 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
405 ewitab
= _mm_cvttpd_epi32(ewrt
);
407 eweps
= _mm_frcz_pd(ewrt
);
409 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
411 twoeweps
= _mm_add_pd(eweps
,eweps
);
412 ewitab
= _mm_slli_epi32(ewitab
,2);
413 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
414 ewtabD
= _mm_setzero_pd();
415 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
416 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
417 ewtabFn
= _mm_setzero_pd();
418 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
419 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
420 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
421 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
422 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
426 velecsum
= _mm_add_pd(velecsum
,velec
);
430 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
432 /* Update vectorial force */
433 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
434 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
435 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
437 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
438 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
439 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r10
= _mm_mul_pd(rsq10
,rinv10
);
447 /* Compute parameters for interactions between i and j atoms */
448 qq10
= _mm_mul_pd(iq1
,jq0
);
450 /* EWALD ELECTROSTATICS */
452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
453 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
454 ewitab
= _mm_cvttpd_epi32(ewrt
);
456 eweps
= _mm_frcz_pd(ewrt
);
458 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
460 twoeweps
= _mm_add_pd(eweps
,eweps
);
461 ewitab
= _mm_slli_epi32(ewitab
,2);
462 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
463 ewtabD
= _mm_setzero_pd();
464 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
465 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
466 ewtabFn
= _mm_setzero_pd();
467 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
468 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
469 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
470 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
471 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
475 velecsum
= _mm_add_pd(velecsum
,velec
);
479 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
481 /* Update vectorial force */
482 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
483 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
484 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
486 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
487 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
488 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
490 /**************************
491 * CALCULATE INTERACTIONS *
492 **************************/
494 r20
= _mm_mul_pd(rsq20
,rinv20
);
496 /* Compute parameters for interactions between i and j atoms */
497 qq20
= _mm_mul_pd(iq2
,jq0
);
499 /* EWALD ELECTROSTATICS */
501 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
502 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
503 ewitab
= _mm_cvttpd_epi32(ewrt
);
505 eweps
= _mm_frcz_pd(ewrt
);
507 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
509 twoeweps
= _mm_add_pd(eweps
,eweps
);
510 ewitab
= _mm_slli_epi32(ewitab
,2);
511 ewtabF
= _mm_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
512 ewtabD
= _mm_setzero_pd();
513 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
514 ewtabV
= _mm_load_sd( ewtab
+ _mm_extract_epi32(ewitab
,0) +2);
515 ewtabFn
= _mm_setzero_pd();
516 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
517 felec
= _mm_macc_pd(eweps
,ewtabD
,ewtabF
);
518 velec
= _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace
,eweps
) ,_mm_add_pd(ewtabF
,felec
), ewtabV
);
519 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
520 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
524 velecsum
= _mm_add_pd(velecsum
,velec
);
528 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
530 /* Update vectorial force */
531 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
532 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
533 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
535 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
536 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
537 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
539 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
541 /* Inner loop uses 135 flops */
544 /* End of innermost loop */
546 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
547 f
+i_coord_offset
,fshift
+i_shift_offset
);
550 /* Update potential energies */
551 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
553 /* Increment number of inner iterations */
554 inneriter
+= j_index_end
- j_index_start
;
556 /* Outer loop uses 19 flops */
559 /* Increment number of outer iterations */
562 /* Update outer/inner flops */
564 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_VF
,outeriter
*19 + inneriter
*135);
567 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_double
568 * Electrostatics interaction: Ewald
569 * VdW interaction: None
570 * Geometry: Water3-Particle
571 * Calculate force/pot: Force
574 nb_kernel_ElecEw_VdwNone_GeomW3P1_F_avx_128_fma_double
575 (t_nblist
* gmx_restrict nlist
,
576 rvec
* gmx_restrict xx
,
577 rvec
* gmx_restrict ff
,
578 struct t_forcerec
* gmx_restrict fr
,
579 t_mdatoms
* gmx_restrict mdatoms
,
580 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
581 t_nrnb
* gmx_restrict nrnb
)
583 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
584 * just 0 for non-waters.
585 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
586 * jnr indices corresponding to data put in the four positions in the SIMD register.
588 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
589 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
591 int j_coord_offsetA
,j_coord_offsetB
;
592 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
594 real
*shiftvec
,*fshift
,*x
,*f
;
595 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
597 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
599 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
601 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
602 int vdwjidx0A
,vdwjidx0B
;
603 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
604 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
605 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
606 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
607 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
610 __m128d ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
612 __m128d dummy_mask
,cutoff_mask
;
613 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
614 __m128d one
= _mm_set1_pd(1.0);
615 __m128d two
= _mm_set1_pd(2.0);
621 jindex
= nlist
->jindex
;
623 shiftidx
= nlist
->shift
;
625 shiftvec
= fr
->shift_vec
[0];
626 fshift
= fr
->fshift
[0];
627 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
628 charge
= mdatoms
->chargeA
;
630 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
631 ewtab
= fr
->ic
->tabq_coul_F
;
632 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
633 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
635 /* Setup water-specific parameters */
636 inr
= nlist
->iinr
[0];
637 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
638 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
639 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
641 /* Avoid stupid compiler warnings */
649 /* Start outer loop over neighborlists */
650 for(iidx
=0; iidx
<nri
; iidx
++)
652 /* Load shift vector for this list */
653 i_shift_offset
= DIM
*shiftidx
[iidx
];
655 /* Load limits for loop over neighbors */
656 j_index_start
= jindex
[iidx
];
657 j_index_end
= jindex
[iidx
+1];
659 /* Get outer coordinate index */
661 i_coord_offset
= DIM
*inr
;
663 /* Load i particle coords and add shift vector */
664 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
665 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
667 fix0
= _mm_setzero_pd();
668 fiy0
= _mm_setzero_pd();
669 fiz0
= _mm_setzero_pd();
670 fix1
= _mm_setzero_pd();
671 fiy1
= _mm_setzero_pd();
672 fiz1
= _mm_setzero_pd();
673 fix2
= _mm_setzero_pd();
674 fiy2
= _mm_setzero_pd();
675 fiz2
= _mm_setzero_pd();
677 /* Start inner kernel loop */
678 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
681 /* Get j neighbor index, and coordinate index */
684 j_coord_offsetA
= DIM
*jnrA
;
685 j_coord_offsetB
= DIM
*jnrB
;
687 /* load j atom coordinates */
688 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
691 /* Calculate displacement vector */
692 dx00
= _mm_sub_pd(ix0
,jx0
);
693 dy00
= _mm_sub_pd(iy0
,jy0
);
694 dz00
= _mm_sub_pd(iz0
,jz0
);
695 dx10
= _mm_sub_pd(ix1
,jx0
);
696 dy10
= _mm_sub_pd(iy1
,jy0
);
697 dz10
= _mm_sub_pd(iz1
,jz0
);
698 dx20
= _mm_sub_pd(ix2
,jx0
);
699 dy20
= _mm_sub_pd(iy2
,jy0
);
700 dz20
= _mm_sub_pd(iz2
,jz0
);
702 /* Calculate squared distance and things based on it */
703 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
704 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
705 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
707 rinv00
= avx128fma_invsqrt_d(rsq00
);
708 rinv10
= avx128fma_invsqrt_d(rsq10
);
709 rinv20
= avx128fma_invsqrt_d(rsq20
);
711 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
712 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
713 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
715 /* Load parameters for j particles */
716 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
718 fjx0
= _mm_setzero_pd();
719 fjy0
= _mm_setzero_pd();
720 fjz0
= _mm_setzero_pd();
722 /**************************
723 * CALCULATE INTERACTIONS *
724 **************************/
726 r00
= _mm_mul_pd(rsq00
,rinv00
);
728 /* Compute parameters for interactions between i and j atoms */
729 qq00
= _mm_mul_pd(iq0
,jq0
);
731 /* EWALD ELECTROSTATICS */
733 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
734 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
735 ewitab
= _mm_cvttpd_epi32(ewrt
);
737 eweps
= _mm_frcz_pd(ewrt
);
739 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
741 twoeweps
= _mm_add_pd(eweps
,eweps
);
742 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
744 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
745 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
749 /* Update vectorial force */
750 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
751 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
752 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
754 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
755 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
756 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 r10
= _mm_mul_pd(rsq10
,rinv10
);
764 /* Compute parameters for interactions between i and j atoms */
765 qq10
= _mm_mul_pd(iq1
,jq0
);
767 /* EWALD ELECTROSTATICS */
769 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
770 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
771 ewitab
= _mm_cvttpd_epi32(ewrt
);
773 eweps
= _mm_frcz_pd(ewrt
);
775 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
777 twoeweps
= _mm_add_pd(eweps
,eweps
);
778 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
780 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
781 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
785 /* Update vectorial force */
786 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
787 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
788 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
790 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
791 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
792 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 r20
= _mm_mul_pd(rsq20
,rinv20
);
800 /* Compute parameters for interactions between i and j atoms */
801 qq20
= _mm_mul_pd(iq2
,jq0
);
803 /* EWALD ELECTROSTATICS */
805 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
806 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
807 ewitab
= _mm_cvttpd_epi32(ewrt
);
809 eweps
= _mm_frcz_pd(ewrt
);
811 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
813 twoeweps
= _mm_add_pd(eweps
,eweps
);
814 gmx_mm_load_2pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),ewtab
+_mm_extract_epi32(ewitab
,1),
816 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
817 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
821 /* Update vectorial force */
822 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
823 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
824 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
826 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
827 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
828 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
830 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
832 /* Inner loop uses 120 flops */
839 j_coord_offsetA
= DIM
*jnrA
;
841 /* load j atom coordinates */
842 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
845 /* Calculate displacement vector */
846 dx00
= _mm_sub_pd(ix0
,jx0
);
847 dy00
= _mm_sub_pd(iy0
,jy0
);
848 dz00
= _mm_sub_pd(iz0
,jz0
);
849 dx10
= _mm_sub_pd(ix1
,jx0
);
850 dy10
= _mm_sub_pd(iy1
,jy0
);
851 dz10
= _mm_sub_pd(iz1
,jz0
);
852 dx20
= _mm_sub_pd(ix2
,jx0
);
853 dy20
= _mm_sub_pd(iy2
,jy0
);
854 dz20
= _mm_sub_pd(iz2
,jz0
);
856 /* Calculate squared distance and things based on it */
857 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
858 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
859 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
861 rinv00
= avx128fma_invsqrt_d(rsq00
);
862 rinv10
= avx128fma_invsqrt_d(rsq10
);
863 rinv20
= avx128fma_invsqrt_d(rsq20
);
865 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
866 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
867 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
869 /* Load parameters for j particles */
870 jq0
= _mm_load_sd(charge
+jnrA
+0);
872 fjx0
= _mm_setzero_pd();
873 fjy0
= _mm_setzero_pd();
874 fjz0
= _mm_setzero_pd();
876 /**************************
877 * CALCULATE INTERACTIONS *
878 **************************/
880 r00
= _mm_mul_pd(rsq00
,rinv00
);
882 /* Compute parameters for interactions between i and j atoms */
883 qq00
= _mm_mul_pd(iq0
,jq0
);
885 /* EWALD ELECTROSTATICS */
887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
888 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
889 ewitab
= _mm_cvttpd_epi32(ewrt
);
891 eweps
= _mm_frcz_pd(ewrt
);
893 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
895 twoeweps
= _mm_add_pd(eweps
,eweps
);
896 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
897 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
898 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
902 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
904 /* Update vectorial force */
905 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
906 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
907 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
909 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
910 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
911 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
913 /**************************
914 * CALCULATE INTERACTIONS *
915 **************************/
917 r10
= _mm_mul_pd(rsq10
,rinv10
);
919 /* Compute parameters for interactions between i and j atoms */
920 qq10
= _mm_mul_pd(iq1
,jq0
);
922 /* EWALD ELECTROSTATICS */
924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
925 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
926 ewitab
= _mm_cvttpd_epi32(ewrt
);
928 eweps
= _mm_frcz_pd(ewrt
);
930 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
932 twoeweps
= _mm_add_pd(eweps
,eweps
);
933 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
934 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
935 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
939 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
941 /* Update vectorial force */
942 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
943 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
944 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
946 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
947 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
948 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 r20
= _mm_mul_pd(rsq20
,rinv20
);
956 /* Compute parameters for interactions between i and j atoms */
957 qq20
= _mm_mul_pd(iq2
,jq0
);
959 /* EWALD ELECTROSTATICS */
961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
963 ewitab
= _mm_cvttpd_epi32(ewrt
);
965 eweps
= _mm_frcz_pd(ewrt
);
967 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
969 twoeweps
= _mm_add_pd(eweps
,eweps
);
970 gmx_mm_load_1pair_swizzle_pd(ewtab
+_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
971 felec
= _mm_macc_pd(eweps
,ewtabFn
,_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
));
972 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
976 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
978 /* Update vectorial force */
979 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
980 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
981 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
983 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
984 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
985 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
987 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
989 /* Inner loop uses 120 flops */
992 /* End of innermost loop */
994 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
995 f
+i_coord_offset
,fshift
+i_shift_offset
);
997 /* Increment number of inner iterations */
998 inneriter
+= j_index_end
- j_index_start
;
1000 /* Outer loop uses 18 flops */
1003 /* Increment number of outer iterations */
1006 /* Update outer/inner flops */
1008 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3_F
,outeriter
*18 + inneriter
*120);