2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/utilities.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/mdlib/nbnxn_consts.h"
47 #include "gromacs/mdlib/nbnxn_search.h"
48 #include "gromacs/mdtypes/commrec.h"
49 #include "gromacs/mdtypes/inputrec.h"
50 #include "gromacs/mdtypes/md_enums.h"
51 #include "gromacs/simd/simd.h"
52 #include "gromacs/topology/ifunc.h"
53 #include "gromacs/topology/topology.h"
54 #include "gromacs/utility/fatalerror.h"
56 /* Computational cost of bonded, non-bonded and PME calculations.
57 * This will be machine dependent.
58 * The numbers are only used for estimating the relative cost of PME vs PP,
59 * so only relative numbers matter.
60 * The numbers here are accurate cycle counts for Haswell in single precision
61 * compiled with gcc5.2. A correction factor for other architectures is given
62 * by simd_cycle_factor().
63 * In double precision PME mesh is slightly cheaper, although not so much
64 * that the numbers need to be adjusted.
67 /* Cost of a pair interaction in the "group" cut-off scheme */
68 static const double c_group_fq
= 18.0;
69 static const double c_group_qlj_cut
= 18.0;
70 static const double c_group_qlj_tab
= 24.0;
71 static const double c_group_lj_cut
= 12.0;
72 static const double c_group_lj_tab
= 21.0;
73 /* Cost of 1 water with one Q/LJ atom */
74 static const double c_group_qljw_cut
= 24.0;
75 static const double c_group_qljw_tab
= 27.0;
76 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
77 static const double c_group_qw
= 21.0;
79 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
80 static const double c_nbnxn_lj
= 2.5;
81 static const double c_nbnxn_qrf_lj
= 2.9;
82 static const double c_nbnxn_qrf
= 2.4;
83 static const double c_nbnxn_qexp_lj
= 4.2;
84 static const double c_nbnxn_qexp
= 3.8;
85 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
86 static const double c_nbnxn_ljexp_add
= 1.0;
88 /* Cost of the different components of PME. */
89 /* Cost of particle reordering and redistribution (no SIMD correction).
90 * This will be zero without MPI and can be very high with load imbalance.
91 * Thus we use an approximate value for medium parallelization.
93 static const double c_pme_redist
= 100.0;
94 /* Cost of q spreading and force interpolation per charge. This part almost
95 * doesn't accelerate with SIMD, so we don't use SIMD correction.
97 static const double c_pme_spread
= 5.0;
98 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
99 * Without MPI the number is 2-3, depending on grid factors and thread count.
100 * We take the high limit to be on the safe side and account for some MPI
101 * communication cost, which will dominate at high parallelization.
103 static const double c_pme_fft
= 3.0;
104 /* Cost of pme_solve, will be multiplied with N */
105 static const double c_pme_solve
= 9.0;
107 /* Cost of a bonded interaction divided by the number of distances calculations
108 * required in one interaction. The actual cost is nearly propotional to this.
110 static const double c_bond
= 25.0;
113 #if GMX_SIMD_HAVE_REAL
114 static const gmx_bool bHaveSIMD
= TRUE
;
116 static const gmx_bool bHaveSIMD
= FALSE
;
119 /* Gives a correction factor for the currently compiled SIMD implementations
120 * versus the reference used for the coefficients above (8-wide SIMD with FMA).
121 * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
123 static double simd_cycle_factor(gmx_bool bUseSIMD
)
125 /* The (average) ratio of the time taken by plain-C force calculations
126 * relative to SIMD versions, for the reference platform Haswell:
127 * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
128 * This factor is used for normalization in simd_cycle_factor().
130 const double simd_cycle_no_simd
= 5.0;
133 #if GMX_SIMD_HAVE_REAL
136 /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
137 * The actual speed-up depends very much on gather+scatter overhead,
138 * which is different for different bonded and non-bonded kernels.
139 * As a rough, but actually not bad, approximation we use a sqrt
140 * dependence on the width which gives a factor 4 for width=8.
142 speedup
= std::sqrt(2.0*GMX_SIMD_REAL_WIDTH
);
143 #if GMX_SIMD_HAVE_FMA
144 /* FMA tends to give a bit more speedup */
155 gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
157 /* No SIMD, no speedup */
161 /* Return speed compared to the reference (Haswell).
162 * For x86 SIMD, the nbnxn kernels are relatively much slower on
163 * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
164 * PME load estimate on SB/IB, which is erring on the safe side.
166 return simd_cycle_no_simd
/speedup
;
169 void count_bonded_distances(const gmx_mtop_t
*mtop
, const t_inputrec
*ir
,
170 double *ndistance_c
, double *ndistance_simd
)
173 double nonsimd_step_frac
;
176 double ndtot_c
, ndtot_simd
;
177 #if GMX_SIMD_HAVE_REAL
178 gmx_bool bSimdBondeds
= TRUE
;
180 gmx_bool bSimdBondeds
= FALSE
;
183 bExcl
= (ir
->cutoff_scheme
== ecutsGROUP
&& inputrecExclForces(ir
)
184 && !EEL_FULL(ir
->coulombtype
));
188 /* We only have SIMD versions of these bondeds without energy and
189 * without shift-forces, we take that into account here.
191 if (ir
->nstcalcenergy
> 0)
193 nonsimd_step_frac
= 1.0/ir
->nstcalcenergy
;
197 nonsimd_step_frac
= 0;
199 if (ir
->epc
!= epcNO
&& 1.0/ir
->nstpcouple
> nonsimd_step_frac
)
201 nonsimd_step_frac
= 1.0/ir
->nstpcouple
;
206 nonsimd_step_frac
= 1;
209 /* Count the number of pbc_rvec_sub calls required for bonded interactions.
210 * This number is also roughly proportional to the computational cost.
214 #if defined _ICC && __ICC == 1400 || defined __ICL && __ICL == 1400
215 #pragma novector /* Work-around for incorrect vectorization */
217 for (mb
= 0; mb
< mtop
->nmolblock
; mb
++)
219 molt
= &mtop
->moltype
[mtop
->molblock
[mb
].type
];
220 nmol
= mtop
->molblock
[mb
].nmol
;
221 for (ftype
= 0; ftype
< F_NRE
; ftype
++)
225 if (interaction_function
[ftype
].flags
& IF_BOND
)
227 double nd_c
, nd_simd
;
231 /* For all interactions, except for the three exceptions
232 * in the switch below, #distances = #atoms - 1.
242 /* These bonded potentially use SIMD */
247 nd_c
= nonsimd_step_frac
*(NRAL(ftype
) - 1);
248 nd_simd
= (1 - nonsimd_step_frac
)*(NRAL(ftype
) - 1);
251 nd_c
= NRAL(ftype
) - 1;
254 nbonds
= nmol
*molt
->ilist
[ftype
].nr
/(1 + NRAL(ftype
));
255 ndtot_c
+= nbonds
*nd_c
;
256 ndtot_simd
+= nbonds
*nd_simd
;
261 ndtot_c
+= nmol
*(molt
->excls
.nra
- molt
->atoms
.nr
)/2;
267 fprintf(debug
, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c
, ndtot_simd
);
270 if (ndistance_c
!= nullptr)
272 *ndistance_c
= ndtot_c
;
274 if (ndistance_simd
!= nullptr)
276 *ndistance_simd
= ndtot_simd
;
280 static void pp_group_load(const gmx_mtop_t
*mtop
, const t_inputrec
*ir
,
282 int *nq_tot
, int *nlj_tot
,
284 gmx_bool
*bChargePerturbed
, gmx_bool
*bTypePerturbed
)
287 int mb
, nmol
, atnr
, cg
, a
, a0
, ncqlj
, ncq
, nclj
;
288 gmx_bool bBHAM
, bLJcut
, bWater
, bQ
, bLJ
;
289 int nw
, nqlj
, nq
, nlj
;
290 double fq
, fqlj
, flj
, fqljw
, fqw
;
294 bBHAM
= (mtop
->ffparams
.functype
[0] == F_BHAM
);
296 bLJcut
= ((ir
->vdwtype
== evdwCUT
) && !bBHAM
);
298 /* Computational cost of bonded, non-bonded and PME calculations.
299 * This will be machine dependent.
300 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
301 * in single precision. In double precision PME mesh is slightly cheaper,
302 * although not so much that the numbers need to be adjusted.
305 fqlj
= (bLJcut
? c_group_qlj_cut
: c_group_qlj_tab
);
306 flj
= (bLJcut
? c_group_lj_cut
: c_group_lj_tab
);
307 /* Cost of 1 water with one Q/LJ atom */
308 fqljw
= (bLJcut
? c_group_qljw_cut
: c_group_qljw_tab
);
309 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
312 iparams
= mtop
->ffparams
.iparams
;
313 atnr
= mtop
->ffparams
.atnr
;
318 *bChargePerturbed
= FALSE
;
319 *bTypePerturbed
= FALSE
;
320 for (mb
= 0; mb
< mtop
->nmolblock
; mb
++)
322 molt
= &mtop
->moltype
[mtop
->molblock
[mb
].type
];
323 atom
= molt
->atoms
.atom
;
324 nmol
= mtop
->molblock
[mb
].nmol
;
326 for (cg
= 0; cg
< molt
->cgs
.nr
; cg
++)
333 while (a
< molt
->cgs
.index
[cg
+1])
335 bQ
= (atom
[a
].q
!= 0 || atom
[a
].qB
!= 0);
336 bLJ
= (iparams
[(atnr
+1)*atom
[a
].type
].lj
.c6
!= 0 ||
337 iparams
[(atnr
+1)*atom
[a
].type
].lj
.c12
!= 0);
338 if (atom
[a
].q
!= atom
[a
].qB
)
340 *bChargePerturbed
= TRUE
;
342 if (atom
[a
].type
!= atom
[a
].typeB
)
344 *bTypePerturbed
= TRUE
;
346 /* This if this atom fits into water optimization */
347 if (!((a
== a0
&& bQ
&& bLJ
) ||
348 (a
== a0
+1 && bQ
&& !bLJ
) ||
349 (a
== a0
+2 && bQ
&& !bLJ
&& atom
[a
].q
== atom
[a
-1].q
) ||
350 (a
== a0
+3 && !bQ
&& bLJ
)))
384 *nq_tot
= nq
+ nqlj
+ nw
*3;
385 *nlj_tot
= nlj
+ nqlj
+ nw
;
389 fprintf(debug
, "nw %d nqlj %d nq %d nlj %d\n", nw
, nqlj
, nq
, nlj
);
392 /* For the PP non-bonded cost it is (unrealistically) assumed
393 * that all atoms are distributed homogeneously in space.
394 * Factor 3 is used because a water molecule has 3 atoms
395 * (and TIP4P effectively has 3 interactions with (water) atoms)).
397 *cost_pp
= 0.5*(fqljw
*nw
*nqlj
+
398 fqw
*nw
*(3*nw
+ nq
) +
400 fq
*nq
*(3*nw
+ nqlj
+ nq
) +
401 flj
*nlj
*(nw
+ nqlj
+ nlj
))
402 *4/3*M_PI
*ir
->rlist
*ir
->rlist
*ir
->rlist
/det(box
);
404 *cost_pp
*= simd_cycle_factor(bHaveSIMD
);
407 static void pp_verlet_load(const gmx_mtop_t
*mtop
, const t_inputrec
*ir
,
409 int *nq_tot
, int *nlj_tot
,
411 gmx_bool
*bChargePerturbed
, gmx_bool
*bTypePerturbed
)
414 int mb
, nmol
, atnr
, a
, nqlj
, nq
, nlj
;
419 double c_qlj
, c_q
, c_lj
;
422 /* Conversion factor for reference vs SIMD kernel performance.
423 * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
426 const real nbnxn_refkernel_fac
= 4.0;
428 const real nbnxn_refkernel_fac
= 8.0;
431 bQRF
= (EEL_RF(ir
->coulombtype
) || ir
->coulombtype
== eelCUT
);
433 iparams
= mtop
->ffparams
.iparams
;
434 atnr
= mtop
->ffparams
.atnr
;
437 *bChargePerturbed
= FALSE
;
438 *bTypePerturbed
= FALSE
;
439 for (mb
= 0; mb
< mtop
->nmolblock
; mb
++)
441 molt
= &mtop
->moltype
[mtop
->molblock
[mb
].type
];
442 atom
= molt
->atoms
.atom
;
443 nmol
= mtop
->molblock
[mb
].nmol
;
444 for (a
= 0; a
< molt
->atoms
.nr
; a
++)
446 if (atom
[a
].q
!= 0 || atom
[a
].qB
!= 0)
448 if (iparams
[(atnr
+1)*atom
[a
].type
].lj
.c6
!= 0 ||
449 iparams
[(atnr
+1)*atom
[a
].type
].lj
.c12
!= 0)
458 if (atom
[a
].q
!= atom
[a
].qB
)
460 *bChargePerturbed
= TRUE
;
462 if (atom
[a
].type
!= atom
[a
].typeB
)
464 *bTypePerturbed
= TRUE
;
469 nlj
= mtop
->natoms
- nqlj
- nq
;
472 *nlj_tot
= nqlj
+ nlj
;
474 /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
475 * This choice should match the one of pick_nbnxn_kernel_cpu().
476 * TODO: Make this function use pick_nbnxn_kernel_cpu().
478 #if GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
483 r_eff
= ir
->rlist
+ nbnxn_get_rlist_effective_inc(j_cluster_size
, mtop
->natoms
/det(box
));
485 /* The average number of pairs per atom */
486 nppa
= 0.5*4/3*M_PI
*r_eff
*r_eff
*r_eff
*mtop
->natoms
/det(box
);
490 fprintf(debug
, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
491 nqlj
, nq
, nlj
, ir
->rlist
, r_eff
, nppa
);
494 /* Determine the cost per pair interaction */
495 c_qlj
= (bQRF
? c_nbnxn_qrf_lj
: c_nbnxn_qexp_lj
);
496 c_q
= (bQRF
? c_nbnxn_qrf
: c_nbnxn_qexp
);
498 if (ir
->vdw_modifier
== eintmodPOTSWITCH
|| EVDW_PME(ir
->vdwtype
))
500 c_qlj
+= c_nbnxn_ljexp_add
;
501 c_lj
+= c_nbnxn_ljexp_add
;
503 if (EVDW_PME(ir
->vdwtype
) && ir
->ljpme_combination_rule
== eljpmeLB
)
505 /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
506 c_qlj
*= nbnxn_refkernel_fac
;
507 c_q
*= nbnxn_refkernel_fac
;
508 c_lj
*= nbnxn_refkernel_fac
;
511 /* For the PP non-bonded cost it is (unrealistically) assumed
512 * that all atoms are distributed homogeneously in space.
514 *cost_pp
= (nqlj
*c_qlj
+ nq
*c_q
+ nlj
*c_lj
)*nppa
;
516 *cost_pp
*= simd_cycle_factor(bHaveSIMD
);
519 float pme_load_estimate(const gmx_mtop_t
*mtop
, const t_inputrec
*ir
,
523 gmx_bool bChargePerturbed
, bTypePerturbed
;
524 double ndistance_c
, ndistance_simd
;
525 double cost_bond
, cost_pp
, cost_redist
, cost_spread
, cost_fft
, cost_solve
, cost_pme
;
528 /* Computational cost of bonded, non-bonded and PME calculations.
529 * This will be machine dependent.
530 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
531 * in single precision. In double precision PME mesh is slightly cheaper,
532 * although not so much that the numbers need to be adjusted.
535 count_bonded_distances(mtop
, ir
, &ndistance_c
, &ndistance_simd
);
536 /* C_BOND is the cost for bonded interactions with SIMD implementations,
537 * so we need to scale the number of bonded interactions for which there
538 * are only C implementations to the number of SIMD equivalents.
540 cost_bond
= c_bond
*(ndistance_c
*simd_cycle_factor(FALSE
) +
541 ndistance_simd
*simd_cycle_factor(bHaveSIMD
));
543 if (ir
->cutoff_scheme
== ecutsGROUP
)
545 pp_group_load(mtop
, ir
, box
,
546 &nq_tot
, &nlj_tot
, &cost_pp
,
547 &bChargePerturbed
, &bTypePerturbed
);
551 pp_verlet_load(mtop
, ir
, box
,
552 &nq_tot
, &nlj_tot
, &cost_pp
,
553 &bChargePerturbed
, &bTypePerturbed
);
561 if (EEL_PME(ir
->coulombtype
))
563 double grid
= ir
->nkx
*ir
->nky
*((ir
->nkz
+ 1)/2);
565 int f
= ((ir
->efep
!= efepNO
&& bChargePerturbed
) ? 2 : 1);
566 cost_redist
+= c_pme_redist
*nq_tot
;
567 cost_spread
+= f
*c_pme_spread
*nq_tot
*gmx::power3(ir
->pme_order
);
568 cost_fft
+= f
*c_pme_fft
*grid
*std::log(grid
)/std::log(2.0);
569 cost_solve
+= f
*c_pme_solve
*grid
*simd_cycle_factor(bHaveSIMD
);
572 if (EVDW_PME(ir
->vdwtype
))
574 double grid
= ir
->nkx
*ir
->nky
*((ir
->nkz
+ 1)/2);
576 int f
= ((ir
->efep
!= efepNO
&& bTypePerturbed
) ? 2 : 1);
577 if (ir
->ljpme_combination_rule
== eljpmeLB
)
579 /* LB combination rule: we have 7 mesh terms */
582 cost_redist
+= c_pme_redist
*nlj_tot
;
583 cost_spread
+= f
*c_pme_spread
*nlj_tot
*gmx::power3(ir
->pme_order
);
584 cost_fft
+= f
*c_pme_fft
*2*grid
*std::log(grid
)/std::log(2.0);
585 cost_solve
+= f
*c_pme_solve
*grid
*simd_cycle_factor(bHaveSIMD
);
588 cost_pme
= cost_redist
+ cost_spread
+ cost_fft
+ cost_solve
;
590 ratio
= cost_pme
/(cost_bond
+ cost_pp
+ cost_pme
);
601 cost_bond
, cost_pp
, cost_redist
, cost_spread
, cost_fft
, cost_solve
);
603 fprintf(debug
, "Estimate for relative PME load: %.3f\n", ratio
);