/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2012,2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include <cmath>

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/simd/simd.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/fatalerror.h"
/* Computational cost of bonded, non-bonded and PME calculations.
 * This will be machine dependent.
 * The numbers are only used for estimating the relative cost of PME vs PP,
 * so only relative numbers matter.
 * The numbers here are accurate cycle counts for Haswell in single precision
 * compiled with gcc5.2. A correction factor for other architectures is given
 * by simd_cycle_factor().
 * In double precision PME mesh is slightly cheaper, although not so much
 * that the numbers need to be adjusted.
 */
/* Cost of a pair interaction in the "group" cut-off scheme */
static const double c_group_fq       = 18.0;
static const double c_group_qlj_cut  = 18.0;
static const double c_group_qlj_tab  = 24.0;
static const double c_group_lj_cut   = 12.0;
static const double c_group_lj_tab   = 21.0;
/* Cost of 1 water with one Q/LJ atom */
static const double c_group_qljw_cut = 24.0;
static const double c_group_qljw_tab = 27.0;
/* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
static const double c_group_qw       = 21.0;
/* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
static const double c_nbnxn_lj        = 2.5;
static const double c_nbnxn_qrf_lj    = 2.9;
static const double c_nbnxn_qrf       = 2.4;
static const double c_nbnxn_qexp_lj   = 4.2;
static const double c_nbnxn_qexp      = 3.8;
/* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
static const double c_nbnxn_ljexp_add = 1.0;
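
/* Illustration of how the constants above combine (assumed example, using
 * only the numbers above): a reaction-field Q+LJ pair with a potential-switched
 * LJ term would be counted as c_nbnxn_qrf_lj + c_nbnxn_ljexp_add = 3.9
 * reference cycles, versus 2.5 cycles for a plain LJ cut-off pair.
 */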
/* Cost of the different components of PME. */

/* Cost of particle reordering and redistribution (no SIMD correction).
 * This will be zero without MPI and can be very high with load imbalance.
 * Thus we use an approximate value for medium parallelization.
 */
static const double c_pme_redist = 100.0;

/* Cost of q spreading and force interpolation per charge. This part almost
 * doesn't accelerate with SIMD, so we don't use SIMD correction.
 */
static const double c_pme_spread = 5.0;

/* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
 * Without MPI the number is 2-3, depending on grid factors and thread count.
 * We take the high limit to be on the safe side and account for some MPI
 * communication cost, which will dominate at high parallelization.
 */
static const double c_pme_fft = 3.0;

/* Cost of pme_solve, will be multiplied with N */
static const double c_pme_solve = 9.0;
/* Cost of a bonded interaction divided by the number of distance calculations
 * required in one interaction. The actual cost is nearly proportional to this.
 */
static const double c_bond = 25.0;
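
/* Rough illustration using only the numbers above: on the reference machine a
 * single bonded distance (25 cycles) costs about as much as six Ewald Q+LJ
 * Verlet pairs (4.2 cycles each).
 */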
#if GMX_SIMD_HAVE_REAL
static const gmx_bool bHaveSIMD = TRUE;
#else
static const gmx_bool bHaveSIMD = FALSE;
#endif

/* Gives a correction factor for the currently compiled SIMD implementations
 * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 * bUseSIMD selects whether we are asking for plain-C (FALSE) or SIMD (TRUE) code.
 */
static double simd_cycle_factor(gmx_bool bUseSIMD)
{
    /* The (average) ratio of the time taken by plain-C force calculations
     * relative to SIMD versions, for the reference platform Haswell:
     * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
     * This factor is used for normalization in simd_cycle_factor().
     */
    const double simd_cycle_no_simd = 5.0;
    double       speedup;

#if GMX_SIMD_HAVE_REAL
    if (bUseSIMD)
    {
        /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
         * The actual speed-up depends very much on gather+scatter overhead,
         * which is different for different bonded and non-bonded kernels.
         * As a rough, but actually not bad, approximation we use a sqrt
         * dependence on the width which gives a factor 4 for width=8.
         */
        speedup = std::sqrt(2.0*GMX_SIMD_REAL_WIDTH);
#if GMX_SIMD_HAVE_FMA
        /* FMA tends to give a bit more speedup */
        speedup *= 1.25;
#endif
    }
    else
    {
        speedup = 1;
    }
#else
    if (bUseSIMD)
    {
        gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
    }
    /* No SIMD, no speedup */
    speedup = 1;
#endif

    /* Return speed compared to the reference (Haswell).
     * For x86 SIMD, the nbnxn kernels are relatively much slower on
     * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
     * PME load estimate on SB/IB, which is erring on the safe side.
     */
    return simd_cycle_no_simd/speedup;
}
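
/* Worked example of the factor above (illustrative, assuming the sqrt model
 * holds): with 4-wide SIMD and FMA, speedup = sqrt(2*4)*1.25 ~= 3.5, so the
 * returned factor is 5/3.5 ~= 1.4; for the 8-wide FMA reference the factor is
 * exactly 1; for plain-C code (speedup = 1) it is 5.
 */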

void count_bonded_distances(const gmx_mtop_t *mtop, const t_inputrec *ir,
                            double *ndistance_c, double *ndistance_simd)
{
    gmx_bool bExcl;
    double   nonsimd_step_frac;
    double   ndtot_c, ndtot_simd;
#if GMX_SIMD_HAVE_REAL
    gmx_bool bSimdBondeds = TRUE;
#else
    gmx_bool bSimdBondeds = FALSE;
#endif

    bExcl = (ir->cutoff_scheme == ecutsGROUP && inputrecExclForces(ir)
             && !EEL_FULL(ir->coulombtype));

    /* We only have SIMD versions of these bondeds without energy and
     * without shift-forces, we take that into account here.
     */
    if (bSimdBondeds)
    {
        if (ir->nstcalcenergy > 0)
        {
            nonsimd_step_frac = 1.0/ir->nstcalcenergy;
        }
        else
        {
            nonsimd_step_frac = 0;
        }
        if (ir->epc != epcNO && 1.0/ir->nstpcouple > nonsimd_step_frac)
        {
            nonsimd_step_frac = 1.0/ir->nstpcouple;
        }
    }
    else
    {
        nonsimd_step_frac = 1;
    }

    /* Count the number of pbc_rvec_sub calls required for bonded interactions.
     * This number is also roughly proportional to the computational cost.
     */
    ndtot_c    = 0;
    ndtot_simd = 0;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t *molt = &mtop->moltype[molb.type];
        for (int ftype = 0; ftype < F_NRE; ftype++)
        {
            if (interaction_function[ftype].flags & IF_BOND)
            {
                double nd_c, nd_simd;

                /* For all interactions, except for the three exceptions
                 * in the switch below, #distances = #atoms - 1.
                 */
                switch (ftype)
                {
                    case F_POSRES:
                    case F_FBPOSRES:
                        nd_c    = 1;
                        nd_simd = 0;
                        break;
                    case F_CONNBONDS:
                        nd_c    = 0;
                        nd_simd = 0;
                        break;
                    /* These bonded potentially use SIMD */
                    case F_ANGLES:
                    case F_PDIHS:
                    case F_RBDIHS:
                    case F_LJ14:
                        nd_c    =      nonsimd_step_frac *(NRAL(ftype) - 1);
                        nd_simd = (1 - nonsimd_step_frac)*(NRAL(ftype) - 1);
                        break;
                    default:
                        nd_c    = NRAL(ftype) - 1;
                        nd_simd = 0;
                        break;
                }

                int nbonds  = molb.nmol*molt->ilist[ftype].nr/(1 + NRAL(ftype));
                ndtot_c    += nbonds*nd_c;
                ndtot_simd += nbonds*nd_simd;
            }
        }
        if (bExcl)
        {
            ndtot_c += molb.nmol*(molt->excls.nra - molt->atoms.nr)/2;
        }
    }

    if (debug)
    {
        fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n",
                ndtot_c, ndtot_simd);
    }

    if (ndistance_c != nullptr)
    {
        *ndistance_c = ndtot_c;
    }
    if (ndistance_simd != nullptr)
    {
        *ndistance_simd = ndtot_simd;
    }
}
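
/* Worked example for the counting above (illustrative numbers): with
 * nstcalcenergy = 100 and no pressure coupling, nonsimd_step_frac = 0.01, so
 * each angle (3 atoms, 2 distances) contributes 0.02 distances to ndtot_c and
 * 1.98 to ndtot_simd, while an interaction without a SIMD kernel contributes
 * all of its #atoms - 1 distances to ndtot_c.
 */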

static void pp_group_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
                          const matrix box,
                          int *nq_tot, int *nlj_tot,
                          double *cost_pp,
                          gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    int              atnr, cg, a0, ncqlj, ncq, nclj;
    gmx_bool         bBHAM, bLJcut, bWater, bQ, bLJ;
    int              nw, nqlj, nq, nlj;
    double           fq, fqlj, flj, fqljw, fqw;
    const t_iparams *iparams;

    bBHAM = (mtop->ffparams.functype[0] == F_BHAM);

    bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM);

    /* Computational cost of bonded, non-bonded and PME calculations.
     * This will be machine dependent.
     * The numbers here are accurate for Intel Core2 and AMD Athlon 64
     * in single precision. In double precision PME mesh is slightly cheaper,
     * although not so much that the numbers need to be adjusted.
     */
    fq    = c_group_fq;
    fqlj  = (bLJcut ? c_group_qlj_cut  : c_group_qlj_tab);
    flj   = (bLJcut ? c_group_lj_cut   : c_group_lj_tab);
    /* Cost of 1 water with one Q/LJ atom */
    fqljw = (bLJcut ? c_group_qljw_cut : c_group_qljw_tab);
    /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
    fqw   = c_group_qw;

    iparams           = mtop->ffparams.iparams;
    atnr              = mtop->ffparams.atnr;
    nw                = 0;
    nqlj              = 0;
    nq                = 0;
    nlj               = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t *molt = &mtop->moltype[molb.type];
        const t_atom        *atom = molt->atoms.atom;
        int                  a    = 0;

        for (cg = 0; cg < molt->cgs.nr; cg++)
        {
            bWater = !bBHAM;
            ncqlj  = 0;
            ncq    = 0;
            nclj   = 0;
            a0     = a;
            while (a < molt->cgs.index[cg+1])
            {
                bQ  = (atom[a].q != 0 || atom[a].qB != 0);
                bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
                       iparams[(atnr+1)*atom[a].type].lj.c12 != 0);
                if (atom[a].q != atom[a].qB)
                {
                    *bChargePerturbed = TRUE;
                }
                if (atom[a].type != atom[a].typeB)
                {
                    *bTypePerturbed = TRUE;
                }
                /* Check if this atom fits into the water optimization */
                if (!((a == a0   &&  bQ &&  bLJ) ||
                      (a == a0+1 &&  bQ && !bLJ) ||
                      (a == a0+2 &&  bQ && !bLJ && atom[a].q == atom[a-1].q) ||
                      (a == a0+3 && !bQ &&  bLJ)))
                {
                    bWater = FALSE;
                }
                if (bQ && bLJ)
                {
                    ncqlj++;
                }
                else
                {
                    if (bQ)
                    {
                        ncq++;
                    }
                    if (bLJ)
                    {
                        nclj++;
                    }
                }
                a++;
            }
            if (bWater)
            {
                nw   += molb.nmol;
            }
            else
            {
                nqlj += molb.nmol*ncqlj;
                nq   += molb.nmol*ncq;
                nlj  += molb.nmol*nclj;
            }
        }
    }

    *nq_tot  = nq  + nqlj + nw*3;
    *nlj_tot = nlj + nqlj + nw;

    if (debug)
    {
        fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj);
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     * Factor 3 is used because a water molecule has 3 atoms
     * (and TIP4P effectively has 3 interactions with (water) atoms).
     */
    *cost_pp = 0.5*(fqljw*nw*nqlj +
                    fqw  *nw*(3*nw + nq) +
                    fqlj *nqlj*nqlj +
                    fq   *nq*(3*nw + nqlj + nq) +
                    flj  *nlj*(nw + nqlj + nlj))
        *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box);

    *cost_pp *= simd_cycle_factor(bHaveSIMD);
}
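
/* Illustration of the estimate above (assumed example, not taken from this
 * file): for a box of pure 3-site water all non-water counters are zero, so
 * the expression reduces to 0.5*fqw*3*nw*nw*(4/3)*pi*rlist^3/det(box), i.e.
 * the number of water-water pairs within the list radius times the per-pair
 * cost, before the final simd_cycle_factor scaling.
 */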

static void pp_verlet_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
                           const matrix box,
                           int *nq_tot, int *nlj_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    int              atnr, a, nqlj, nq, nlj;
    gmx_bool         bQRF;
    const t_iparams *iparams;
    real             r_eff;
    double           c_qlj, c_q, c_lj, nppa;
    int              j_cluster_size;
    /* Conversion factor for reference vs SIMD kernel performance.
     * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
     */
#if GMX_DOUBLE
    const real       nbnxn_refkernel_fac = 4.0;
#else
    const real       nbnxn_refkernel_fac = 8.0;
#endif

    bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);

    iparams           = mtop->ffparams.iparams;
    atnr              = mtop->ffparams.atnr;
    nqlj              = 0;
    nq                = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (const gmx_molblock_t &molb : mtop->molblock)
    {
        const gmx_moltype_t *molt = &mtop->moltype[molb.type];
        const t_atom        *atom = molt->atoms.atom;

        for (a = 0; a < molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
                    iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += molb.nmol;
                }
                else
                {
                    nq += molb.nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
            if (atom[a].type != atom[a].typeB)
            {
                *bTypePerturbed = TRUE;
            }
        }
    }

    nlj = mtop->natoms - nqlj - nq;

    *nq_tot  = nqlj + nq;
    *nlj_tot = nqlj + nlj;

    /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
     * This choice should match the one of pick_nbnxn_kernel_cpu().
     * TODO: Make this function use pick_nbnxn_kernel_cpu().
     */
#if GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
    j_cluster_size = 8;
#else
    j_cluster_size = 4;
#endif
    r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop->natoms/det(box));

    /* The average number of pairs per atom */
    nppa = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop->natoms/det(box);

    if (debug)
    {
        fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
                nqlj, nq, nlj, ir->rlist, r_eff, nppa);
    }

    /* Determine the cost per pair interaction */
    c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
    c_q   = (bQRF ? c_nbnxn_qrf    : c_nbnxn_qexp);
    c_lj  = c_nbnxn_lj;
    if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
    {
        c_qlj += c_nbnxn_ljexp_add;
        c_lj  += c_nbnxn_ljexp_add;
    }
    if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
    {
        /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
        c_qlj *= nbnxn_refkernel_fac;
        c_q   *= nbnxn_refkernel_fac;
        c_lj  *= nbnxn_refkernel_fac;
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;

    *cost_pp *= simd_cycle_factor(bHaveSIMD);
}
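
/* Worked example for nppa above (illustrative, assuming a typical atomistic
 * density of ~100 atoms/nm^3 and r_eff = 1.1 nm):
 * nppa = 0.5*(4/3)*pi*1.1^3*100 ~= 280 pairs per atom, so with the constants
 * above a system where every atom has both charge and LJ, with reaction-field
 * electrostatics, would cost roughly 2.9*280 ~= 800 reference cycles per atom
 * per step.
 */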

float pme_load_estimate(const gmx_mtop_t *mtop, const t_inputrec *ir,
                        const matrix box)
{
    int      nq_tot, nlj_tot;
    gmx_bool bChargePerturbed, bTypePerturbed;
    double   ndistance_c, ndistance_simd;
    double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
    float    ratio;

    /* Computational cost of bonded, non-bonded and PME calculations.
     * This will be machine dependent.
     * The numbers here are accurate for Intel Core2 and AMD Athlon 64
     * in single precision. In double precision PME mesh is slightly cheaper,
     * although not so much that the numbers need to be adjusted.
     */

    count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
    /* c_bond is the cost for bonded interactions with SIMD implementations,
     * so we need to scale the number of bonded interactions for which there
     * are only C implementations to the number of SIMD equivalents.
     */
    cost_bond = c_bond*(ndistance_c   *simd_cycle_factor(FALSE) +
                        ndistance_simd*simd_cycle_factor(bHaveSIMD));

    if (ir->cutoff_scheme == ecutsGROUP)
    {
        pp_group_load(mtop, ir, box,
                      &nq_tot, &nlj_tot, &cost_pp,
                      &bChargePerturbed, &bTypePerturbed);
    }
    else
    {
        pp_verlet_load(mtop, ir, box,
                       &nq_tot, &nlj_tot, &cost_pp,
                       &bChargePerturbed, &bTypePerturbed);
    }

    cost_redist = 0;
    cost_spread = 0;
    cost_fft    = 0;
    cost_solve  = 0;

    if (EEL_PME(ir->coulombtype))
    {
        double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);

        int    f     = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1);
        cost_redist +=   c_pme_redist*nq_tot;
        cost_spread += f*c_pme_spread*nq_tot*gmx::power3(ir->pme_order);
        cost_fft    += f*c_pme_fft*grid*std::log(grid)/std::log(2.0);
        cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
    }

    if (EVDW_PME(ir->vdwtype))
    {
        double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);

        int    f     = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1);
        if (ir->ljpme_combination_rule == eljpmeLB)
        {
            /* LB combination rule: we have 7 mesh terms */
            f *= 7;
        }
        cost_redist +=   c_pme_redist*nlj_tot;
        cost_spread += f*c_pme_spread*nlj_tot*gmx::power3(ir->pme_order);
        cost_fft    += f*c_pme_fft*2*grid*std::log(grid)/std::log(2.0);
        cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
    }

    cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;

    ratio = cost_pme/(cost_bond + cost_pp + cost_pme);

    if (debug)
    {
        fprintf(debug,
                "cost_bond %f cost_pp %f cost_redist %f cost_spread %f cost_fft %f cost_solve %f\n",
                cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);

        fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
    }

    return ratio;
}
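
/* Usage sketch (illustrative, not part of this file): a caller that wants a
 * first guess at how many ranks to dedicate to PME could do something like
 *
 *   float ratio = pme_load_estimate(mtop, ir, box);
 *   int   npme  = static_cast<int>(ratio*nranks + 0.5);
 *
 * where nranks is a hypothetical total rank count; the estimate is only a
 * relative cost, so such a split is a starting guess, not a measurement.
 */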