/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2012,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "perf_est.h"

#include "config.h"

#include <cmath>

#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/nbnxm_geometry.h"
#include "gromacs/simd/simd.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/fatalerror.h"

/* Computational cost of bonded, non-bonded and PME calculations.
 * This will be machine dependent.
 * The numbers are only used for estimating the relative cost of PME vs PP,
 * so only relative numbers matter.
 * The numbers here are accurate cycle counts for Haswell in single precision
 * compiled with gcc5.2. A correction factor for other architectures is given
 * by simd_cycle_factor().
 * In double precision PME mesh is slightly cheaper, although not so much
 * that the numbers need to be adjusted.
 */

/* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
static const double c_nbnxn_lj        = 2.5;
static const double c_nbnxn_qrf_lj    = 2.9;
static const double c_nbnxn_qrf       = 2.4;
static const double c_nbnxn_qexp_lj   = 4.2;
static const double c_nbnxn_qexp      = 3.8;
/* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
static const double c_nbnxn_ljexp_add = 1.0;
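
/* Worked example of how the per-pair constants combine (illustrative
 * arithmetic only, derived from the constants above): with Ewald
 * electrostatics and potential-switched LJ, a pair with both charge and LJ
 * costs c_nbnxn_qexp_lj + c_nbnxn_ljexp_add = 4.2 + 1.0 = 5.2 reference
 * cycles, while the same pair with reaction-field costs
 * c_nbnxn_qrf_lj + c_nbnxn_ljexp_add = 3.9. Only these ratios, not the
 * absolute cycle counts, affect the final PME load estimate.
 */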

/* Cost of the different components of PME. */

/* Cost of particle reordering and redistribution (no SIMD correction).
 * This will be zero without MPI and can be very high with load imbalance.
 * Thus we use an approximate value for medium parallelization.
 */
static const double c_pme_redist = 100.0;

/* Cost of q spreading and force interpolation per charge. This part almost
 * doesn't accelerate with SIMD, so we don't use SIMD correction.
 */
static const double c_pme_spread = 5.0;

/* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
 * Without MPI the number is 2-3, depending on grid factors and thread count.
 * We take the high limit to be on the safe side and account for some MPI
 * communication cost, which will dominate at high parallelization.
 */
static const double c_pme_fft = 3.0;

/* Cost of pme_solve, will be multiplied with N */
static const double c_pme_solve = 9.0;
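
/* Worked example of the mesh-cost scaling (illustrative arithmetic only; the
 * 64x64x64 grid is an assumed example, not a default): pme_load_estimate()
 * below uses N = nkx*nky*(nkz + 1)/2 grid points, so for a 64^3 Coulomb grid
 * N = 64*64*32 = 131072, giving cost_fft ~ c_pme_fft*N*log2(N) ~ 6.7e6 and
 * cost_solve ~ c_pme_solve*N ~ 1.2e6 reference cycles per step, on top of
 * the per-charge spreading term.
 */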

/* Cost of a bonded interaction divided by the number of distance calculations
 * required in one interaction. The actual cost is nearly proportional to this.
 */
static const double c_bond = 25.0;
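
/* Illustrative example (follows directly from c_bond and NRAL): an angle
 * involves 3 atoms and thus 2 distance calculations, so it counts as roughly
 * 2*c_bond = 50 reference cycles, while a proper dihedral with 4 atoms
 * counts as 3*c_bond = 75.
 */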

#if GMX_SIMD_HAVE_REAL
static const gmx_bool bHaveSIMD = TRUE;
#else
static const gmx_bool bHaveSIMD = FALSE;
#endif

/* Gives a correction factor for the currently compiled SIMD implementations
 * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 * bUseSIMD sets whether we are asking for plain-C (FALSE) or SIMD (TRUE) code.
 */
static double simd_cycle_factor(gmx_bool bUseSIMD)
{
    /* The (average) ratio of the time taken by plain-C force calculations
     * relative to SIMD versions, for the reference platform Haswell:
     * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
     * This factor is used for normalization in simd_cycle_factor().
     */
    const double simd_cycle_no_simd = 5.0;
    double       speedup;

#if GMX_SIMD_HAVE_REAL
    if (bUseSIMD)
    {
        /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
         * The actual speed-up depends very much on gather+scatter overhead,
         * which is different for different bonded and non-bonded kernels.
         * As a rough, but actually not bad, approximation we use a sqrt
         * dependence on the width which gives a factor 4 for width=8.
         */
        speedup = std::sqrt(2.0*GMX_SIMD_REAL_WIDTH);
#if GMX_SIMD_HAVE_FMA
        /* FMA tends to give a bit more speedup */
        speedup *= 1.25;
#endif
    }
    else
    {
        speedup = 1.0;
    }
#else
    if (bUseSIMD)
    {
        gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
    }

    /* No SIMD, no speedup */
    speedup = 1.0;
#endif

    /* Return speed compared to the reference (Haswell).
     * For x86 SIMD, the nbnxn kernels are relatively much slower on
     * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
     * PME load estimate on SB/IB, which is erring on the safe side.
     */
    return simd_cycle_no_simd/speedup;
}
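
/* Example values of the factor returned above (arithmetic only, derived from
 * the formula in simd_cycle_factor()): plain-C code always gets 5/1 = 5.0;
 * 8-wide SIMD with FMA gets 5/(sqrt(16)*1.25) = 1.0 (the reference);
 * 4-wide SIMD without FMA would get 5/sqrt(8) ~ 1.8.
 */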

void count_bonded_distances(const gmx_mtop_t &mtop, const t_inputrec &ir,
                            double *ndistance_c, double *ndistance_simd)
{
    gmx_bool bExcl;
    double   nonsimd_step_frac;
    int      ftype;
    double   ndtot_c, ndtot_simd;
#if GMX_SIMD_HAVE_REAL
    gmx_bool bSimdBondeds = TRUE;
#else
    gmx_bool bSimdBondeds = FALSE;
#endif

    bExcl = (ir.cutoff_scheme == ecutsGROUP && inputrecExclForces(&ir)
             && !EEL_FULL(ir.coulombtype));

    if (bSimdBondeds)
    {
        /* We only have SIMD versions of these bondeds without energy and
         * without shift-forces, we take that into account here.
         */
        if (ir.nstcalcenergy > 0)
        {
            nonsimd_step_frac = 1.0/ir.nstcalcenergy;
        }
        else
        {
            nonsimd_step_frac = 0;
        }
        if (ir.epc != epcNO && 1.0/ir.nstpcouple > nonsimd_step_frac)
        {
            nonsimd_step_frac = 1.0/ir.nstpcouple;
        }
    }
    else
    {
        nonsimd_step_frac = 1;
    }
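
    /* Example of the fraction above (illustrative values, not taken from any
     * particular .mdp file): with nstcalcenergy=100 and pressure coupling
     * every nstpcouple=10 steps, energies or shift forces are needed on
     * 1 in 10 steps, so nonsimd_step_frac = 0.1 and 90% of these bonded
     * distance calculations are counted as SIMD work.
     */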

    /* Count the number of pbc_rvec_sub calls required for bonded interactions.
     * This number is also roughly proportional to the computational cost.
     */
    ndtot_c    = 0;
    ndtot_simd = 0;
    for (const gmx_molblock_t &molb : mtop.molblock)
    {
        const gmx_moltype_t *molt = &mtop.moltype[molb.type];
        for (ftype = 0; ftype < F_NRE; ftype++)
        {
            int nbonds;

            if (interaction_function[ftype].flags & IF_BOND)
            {
                double nd_c, nd_simd;

                nd_c    = 0;
                nd_simd = 0;
                /* For all interactions, except for the three exceptions
                 * in the switch below, #distances = #atoms - 1.
                 */
                switch (ftype)
                {
                    case F_POSRES:
                    case F_FBPOSRES:
                        nd_c    = 1;
                        break;
                    case F_CONNBONDS:
                        break;
                    /* These bondeds potentially use SIMD */
                    case F_ANGLES:
                    case F_PDIHS:
                    case F_RBDIHS:
                    case F_LJ14:
                        nd_c    =      nonsimd_step_frac *(NRAL(ftype) - 1);
                        nd_simd = (1 - nonsimd_step_frac)*(NRAL(ftype) - 1);
                        break;
                    default:
                        nd_c    = NRAL(ftype) - 1;
                        break;
                }
                nbonds      = molb.nmol*molt->ilist[ftype].size()/(1 + NRAL(ftype));
                ndtot_c    += nbonds*nd_c;
                ndtot_simd += nbonds*nd_simd;
            }
        }
        if (bExcl)
        {
            ndtot_c += molb.nmol*(molt->excls.nra - molt->atoms.nr)/2.;
        }
    }

    if (debug)
    {
        fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n",
                ndtot_c, ndtot_simd);
    }

    if (ndistance_c != nullptr)
    {
        *ndistance_c = ndtot_c;
    }
    if (ndistance_simd != nullptr)
    {
        *ndistance_simd = ndtot_simd;
    }
}

static void pp_verlet_load(const gmx_mtop_t &mtop, const t_inputrec &ir,
                           const matrix box,
                           int *nq_tot, int *nlj_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    int      atnr, a, nqlj, nq, nlj;
    gmx_bool bQRF;
    real     r_eff;
    double   c_qlj, c_q, c_lj;
    double   nppa;
    int      j_cluster_size;
    /* Conversion factor for reference vs SIMD kernel performance.
     * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
     */
#if GMX_DOUBLE
    const real nbnxn_refkernel_fac = 4.0;
#else
    const real nbnxn_refkernel_fac = 8.0;
#endif

    bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == eelCUT);

    gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
    atnr = mtop.ffparams.atnr;
    nqlj = 0;
    nq   = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (const gmx_molblock_t &molb : mtop.molblock)
    {
        const gmx_moltype_t *molt = &mtop.moltype[molb.type];
        const t_atom        *atom = molt->atoms.atom;
        for (a = 0; a < molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr + 1)*atom[a].type].lj.c6  != 0 ||
                    iparams[(atnr + 1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += molb.nmol;
                }
                else
                {
                    nq += molb.nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
            if (atom[a].type != atom[a].typeB)
            {
                *bTypePerturbed = TRUE;
            }
        }
    }

    nlj = mtop.natoms - nqlj - nq;

    *nq_tot  = nqlj + nq;
    *nlj_tot = nqlj + nlj;

    /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
     * This choice should match the one of pick_nbnxn_kernel_cpu().
     * TODO: Make this function use pick_nbnxn_kernel_cpu().
     */
#if GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
    j_cluster_size = 8;
#else
    j_cluster_size = 4;
#endif
    r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms/det(box));

    /* The average number of pairs per atom */
    nppa = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop.natoms/det(box);
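
    /* Worked example of the pair count above (illustrative numbers only):
     * at a typical atomistic density of ~100 atoms/nm^3 with r_eff ~ 1.05 nm,
     * nppa ~ 0.5*(4/3)*pi*1.05^3*100 ~ 240 pairs per atom; the factor 0.5
     * counts each pair only once.
     */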

    if (debug)
    {
        fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
                nqlj, nq, nlj, ir.rlist, r_eff, nppa);
    }

    /* Determine the cost per pair interaction */
    c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
    c_q   = (bQRF ? c_nbnxn_qrf    : c_nbnxn_qexp);
    c_lj  = c_nbnxn_lj;
    if (ir.vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir.vdwtype))
    {
        c_qlj += c_nbnxn_ljexp_add;
        c_lj  += c_nbnxn_ljexp_add;
    }
    if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == eljpmeLB)
    {
        /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
        c_qlj *= nbnxn_refkernel_fac;
        c_q   *= nbnxn_refkernel_fac;
        c_lj  *= nbnxn_refkernel_fac;
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;

    *cost_pp *= simd_cycle_factor(bHaveSIMD);
}

float pme_load_estimate(const gmx_mtop_t &mtop, const t_inputrec &ir,
                        const matrix box)
{
    int      nq_tot, nlj_tot;
    gmx_bool bChargePerturbed, bTypePerturbed;
    double   ndistance_c, ndistance_simd;
    double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
    float    ratio;

    /* Computational cost of bonded, non-bonded and PME calculations.
     * This will be machine dependent.
     * The numbers here are accurate for Intel Core2 and AMD Athlon 64
     * in single precision. In double precision PME mesh is slightly cheaper,
     * although not so much that the numbers need to be adjusted.
     */

    count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
    /* C_BOND is the cost for bonded interactions with SIMD implementations,
     * so we need to scale the number of bonded interactions for which there
     * are only C implementations to the number of SIMD equivalents.
     */
    cost_bond = c_bond*(ndistance_c   *simd_cycle_factor(FALSE) +
                        ndistance_simd*simd_cycle_factor(bHaveSIMD));
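
    /* Example of the scaling above (arithmetic only): on an 8-wide FMA SIMD
     * build simd_cycle_factor(FALSE) = 5 and simd_cycle_factor(TRUE) = 1, so
     * each C-only bonded distance counts as five SIMD-equivalent distances.
     */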

    pp_verlet_load(mtop, ir, box,
                   &nq_tot, &nlj_tot, &cost_pp,
                   &bChargePerturbed, &bTypePerturbed);

    cost_redist = 0;
    cost_spread = 0;
    cost_fft    = 0;
    cost_solve  = 0;

    /* Approximate number of z-grid points after the real-to-complex FFT */
    int gridNkzFactor = int{
        (ir.nkz + 1)/2
    };

    if (EEL_PME(ir.coulombtype))
    {
        double grid = ir.nkx*ir.nky*gridNkzFactor;

        int    f     = ((ir.efep != efepNO && bChargePerturbed) ? 2 : 1);
        cost_redist +=   c_pme_redist*nq_tot;
        cost_spread += f*c_pme_spread*nq_tot*gmx::power3(ir.pme_order);
        cost_fft    += f*c_pme_fft*grid*std::log(grid)/std::log(2.0);
        cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
    }

    if (EVDW_PME(ir.vdwtype))
    {
        double grid = ir.nkx*ir.nky*gridNkzFactor;

        int    f     = ((ir.efep != efepNO && bTypePerturbed) ? 2 : 1);
        if (ir.ljpme_combination_rule == eljpmeLB)
        {
            /* LB combination rule: we have 7 mesh terms */
            f *= 7;
        }
        cost_redist +=   c_pme_redist*nlj_tot;
        cost_spread += f*c_pme_spread*nlj_tot*gmx::power3(ir.pme_order);
        cost_fft    += f*c_pme_fft*2*grid*std::log(grid)/std::log(2.0);
        cost_solve  += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
    }

    cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;

    ratio = cost_pme/(cost_bond + cost_pp + cost_pme);

    if (debug)
    {
        fprintf(debug,
                "cost_bond   %f\n"
                "cost_pp     %f\n"
                "cost_redist %f\n"
                "cost_spread %f\n"
                "cost_fft    %f\n"
                "cost_solve  %f\n",
                cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);

        fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
    }

    return ratio;
}
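
/* Usage sketch (illustrative; the caller and any threshold are assumptions,
 * not taken from this file):
 *
 *   float pmeLoad = pme_load_estimate(mtop, ir, box);
 *   // pmeLoad is the estimated fraction of the force-calculation cost spent
 *   // in PME (redistribution + spreading + FFT + solve); a caller could use
 *   // it, e.g., when deciding how many separate PME ranks to set up.
 */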