From 4dd6716e2191896c0577608081414509e41ee07c Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Mon, 14 Sep 2015 15:01:48 +0200
Subject: [PATCH] Detect incorrect cycle counting

When threads are not pinned to cores, the wallcycle counting can get
messed up on machines where the cycle counters are not synchronized
between the cores. Since it is difficult to detect thread pinning on
all architectures, and since even without pinning the counting can be
correct on machines with synchronized counters, we instead try to
detect incorrect counts. We only detect negative cycle counts, but
that should catch nearly all cases. When we detect an invalid count,
we ignore the cycles counted and do not print the cycle accounting
table. Dynamic load balancing is never disabled, because a few
incorrect load measurements do not cause problems there.

Fixes #1821

Change-Id: I076ad685a043f1f0b913a9b089ebea43a62534f5
---
 src/gromacs/domdec/domdec.cpp  | 28 ++++++++++++++++++----
 src/gromacs/timing/wallcycle.c | 53 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 3986ed0562..1e39e9f86c 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5200,6 +5200,16 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 {
+    /* Note that the cycles value can be incorrect, either 0 or some
+     * extremely large value, when our thread migrated to another core
+     * with an unsynchronized cycle counter. If this happens less often
+     * than once per nstlist steps, this will not cause issues, since
+     * we later subtract the maximum value from the sum over nstlist steps.
+     * A zero count will slightly lower the total, but that is a small effect.
+     * Note that the main purpose of the subtraction of the maximum value
+     * is to avoid throwing off the load balancing when stalls occur due to,
+     * e.g., system activity or network congestion.
+     */
     dd->comm->cycl[ddCycl] += cycles;
     dd->comm->cycl_n[ddCycl]++;
     if (cycles > dd->comm->cycl_max[ddCycl])
     {
@@ -5465,7 +5475,7 @@ static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
     /* Return the relative performance loss on the total run time
      * due to the force calculation load imbalance.
      */
-    if (dd->comm->nload > 0)
+    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
     {
         return
             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
@@ -5491,7 +5501,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
     npp    = dd->nnodes;
     npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
     nnodes = npp + npme;
-    if (dd->nnodes > 1)
+    if (dd->nnodes > 1 && comm->load_sum > 0)
     {
         imbal  = comm->load_max*npp/comm->load_sum - 1;
         lossf  = dd_force_imb_perf_loss(dd);
@@ -5520,10 +5530,10 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
         fprintf(fplog, "%s", buf);
         fprintf(stderr, "%s", buf);
     }
-    if (npme > 0)
+    if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
     {
         pme_f_ratio = comm->load_pme/comm->load_mdf;
-        lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
+        lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
         if (lossp <= 0)
         {
             lossp *= (float)npme/(float)nnodes;
@@ -5587,7 +5597,15 @@ static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 
 static float dd_f_imbal(gmx_domdec_t *dd)
 {
-    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
+    if (dd->comm->load[0].sum > 0)
+    {
+        return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
+    }
+    else
+    {
+        /* Something is wrong in the cycle counting, report no load imbalance */
+        return 0.0f;
+    }
 }
 
 float dd_pme_f_ratio(gmx_domdec_t *dd)
diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c
index 4961ce58a3..f603bf0fe4 100644
--- a/src/gromacs/timing/wallcycle.c
+++ b/src/gromacs/timing/wallcycle.c
@@ -73,6 +73,8 @@ typedef struct
 typedef struct gmx_wallcycle
 {
     wallcc_t        *wcc;
+    /* did we detect one or more invalid cycle counts */
+    gmx_bool         haveInvalidCount;
     /* variables for testing/debugging */
     gmx_bool         wc_barrier;
     wallcc_t        *wcc_all;
@@ -143,6 +145,7 @@ gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused
 
     snew(wc, 1);
 
+    wc->haveInvalidCount = FALSE;
     wc->wc_barrier       = FALSE;
     wc->wcc_all          = NULL;
     wc->wc_depth         = 0;
@@ -321,9 +324,25 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     debug_stop_check(wc, ewc);
 #endif
 
-    cycle = gmx_cycles_read();
-    last  = cycle - wc->wcc[ewc].start;
-    wc->wcc[ewc].c += last;
+    /* When processes or threads migrate between cores, the cycle counting
+     * can get messed up if the cycle counters on different cores are not
+     * synchronized. When this happens we expect both large negative and
+     * positive cycle differences. We can detect negative cycle differences.
+     * Detecting too-large positive counts is difficult, since counts can be
+     * large, especially for ewcRUN. If we detect a negative count,
+     * we will not print the cycle accounting table.
+     */
+    cycle = gmx_cycles_read();
+    if (cycle >= wc->wcc[ewc].start)
+    {
+        last = cycle - wc->wcc[ewc].start;
+    }
+    else
+    {
+        last                 = 0;
+        wc->haveInvalidCount = TRUE;
+    }
+    wc->wcc[ewc].c += last;
     wc->wcc[ewc].n++;
     if (wc->wcc_all)
     {
@@ -361,6 +380,8 @@ void wallcycle_reset_all(gmx_wallcycle_t wc)
         wc->wcc[i].n = 0;
         wc->wcc[i].c = 0;
     }
+    wc->haveInvalidCount = FALSE;
+
     if (wc->wcc_all)
     {
         for (i = 0; i < ewcNR*ewcNR; i++)
@@ -393,9 +414,15 @@ static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 {
     if (wcc[ewc_sub].n > 0)
     {
-        assert(wcc[ewc_main].c >= wcc[ewc_sub].c);
-
-        wcc[ewc_main].c -= wcc[ewc_sub].c;
+        if (wcc[ewc_main].c >= wcc[ewc_sub].c)
+        {
+            wcc[ewc_main].c -= wcc[ewc_sub].c;
+        }
+        else
+        {
+            /* Something is wrong with the cycle counting */
+            wcc[ewc_main].c = 0;
+        }
     }
 }
 
@@ -403,7 +430,7 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 {
     wallcc_t *wcc;
     double    cycles[ewcNR+ewcsNR];
-    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
+    double    cycles_n[ewcNR+ewcsNR+1], buf[ewcNR+ewcsNR+1], *cyc_all, *buf_all;
     int       i, j;
     int       nsum;
 
@@ -491,12 +518,14 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 #ifdef GMX_MPI
     if (cr->nnodes > 1)
     {
-        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
+        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
+        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
                       cr->mpi_comm_mysim);
         for (i = 0; i < ewcNR; i++)
         {
             wcc[i].n = (int)(buf[i] + 0.5);
         }
+        wc->haveInvalidCount = (buf[nsum] > 0);
 #ifdef GMX_CYCLE_SUBCOUNTERS
         for (i = 0; i < ewcsNR; i++)
         {
@@ -692,6 +721,14 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
         return;
     }
 
+    if (wc->haveInvalidCount)
+    {
+        md_print_warn(NULL, fplog, "%s\n",
+                      "NOTE: Detected invalid cycle counts, probably because threads moved "
+                      "between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
+        return;
+    }
+
     /* Conversion factor from cycles to seconds */
     c2t     = realtime/tot;
     c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
-- 
2.11.4.GIT
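
Reviewer note: the comment added in dd_cycles_add refers to subtracting the
maximum per-step value from the sum over nstlist steps. Below is a minimal
standalone sketch of that idea; all names (dd_load_t, load_add, load_signal)
and the sample numbers are illustrative, not the GROMACS API.

#include <stdio.h>

typedef struct
{
    float cycl;     /* sum of per-step cycle counts */
    int   cycl_n;   /* number of steps counted */
    float cycl_max; /* largest single-step count seen */
} dd_load_t;

static void load_add(dd_load_t *load, float cycles)
{
    load->cycl += cycles;
    load->cycl_n++;
    if (cycles > load->cycl_max)
    {
        load->cycl_max = cycles;
    }
}

/* Average step load with the single largest step removed */
static float load_signal(const dd_load_t *load)
{
    if (load->cycl_n < 2)
    {
        return 0.0f; /* not enough samples to discard an outlier */
    }
    return (load->cycl - load->cycl_max)/(float)(load->cycl_n - 1);
}

int main(void)
{
    dd_load_t load = { 0.0f, 0, 0.0f };
    /* Nine normal steps plus one outlier, e.g. an OS stall or a bogus count */
    float     steps[10] = { 10, 11, 9, 10, 10, 12, 9, 11, 10, 500 };
    int       i;

    for (i = 0; i < 10; i++)
    {
        load_add(&load, steps[i]);
    }
    printf("robust per-step load: %.1f cycles\n", load_signal(&load));
    return 0;
}

Discarding one sample per nstlist-step window is why a single zero or huge
count from a core migration cannot throw off the load balancing.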
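The detection added in wallcycle_stop can be tried in isolation. Here is a
minimal sketch assuming x86 and GCC/Clang, with __rdtsc() standing in for
gmx_cycles_read(); the cc_t/cc_start/cc_stop names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h> /* __rdtsc() */

typedef struct
{
    uint64_t start; /* counter value at interval start */
    uint64_t c;     /* accumulated cycles */
    int      n;     /* number of intervals counted */
} cc_t;

static int haveInvalidCount = 0; /* set when a "negative" difference is seen */

static void cc_start(cc_t *cc)
{
    cc->start = __rdtsc();
}

static void cc_stop(cc_t *cc)
{
    uint64_t cycle = __rdtsc();
    uint64_t last;

    if (cycle >= cc->start)
    {
        last = cycle - cc->start;
    }
    else
    {
        /* The thread probably migrated to a core with an unsynchronized
         * counter; discard this interval instead of corrupting the sum. */
        last             = 0;
        haveInvalidCount = 1;
    }
    cc->c += last;
    cc->n++;
}

int main(void)
{
    cc_t cc = { 0, 0, 0 };

    cc_start(&cc);
    /* ... timed work would go here ... */
    cc_stop(&cc);

    if (haveInvalidCount)
    {
        printf("NOTE: invalid cycle count detected, skipping accounting\n");
    }
    else
    {
        printf("%llu cycles over %d interval(s)\n",
               (unsigned long long)cc.c, cc.n);
    }
    return 0;
}

With unsigned counters there is no literal negative value; a stop value below
the matching start value is the detectable symptom, which is exactly what the
patch checks for.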
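The flag propagation in wallcycle_sum piggybacks on the existing MPI_MAX
reduction by appending one extra double, so a flag of 1 on any rank becomes 1
on all ranks. A standalone sketch of that trick, with an illustrative counter
count (build with mpicc, run with mpirun):

#include <mpi.h>
#include <stdio.h>

#define NCOUNTERS 4

int main(int argc, char **argv)
{
    double send[NCOUNTERS + 1]; /* counters plus one flag slot */
    double recv[NCOUNTERS + 1];
    int    rank, i, haveInvalidCount;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    for (i = 0; i < NCOUNTERS; i++)
    {
        send[i] = 100.0*rank + i; /* stand-in for per-rank call counts */
    }
    /* In this example only rank 1 has seen an invalid count */
    haveInvalidCount = (rank == 1);
    send[NCOUNTERS]  = (haveInvalidCount ? 1 : 0);

    /* One reduction delivers both the max counts and the global flag */
    MPI_Allreduce(send, recv, NCOUNTERS + 1, MPI_DOUBLE, MPI_MAX,
                  MPI_COMM_WORLD);

    haveInvalidCount = (recv[NCOUNTERS] > 0);
    if (rank == 0)
    {
        printf("global invalid-count flag: %d\n", haveInvalidCount);
    }

    MPI_Finalize();
    return 0;
}

Under MPI_MAX a 0/1 flag behaves like a logical OR across ranks, so no second
reduction is needed; this is also why the receive buffer must hold nsum + 1
elements, which the declaration change in wallcycle_sum accounts for.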