From 4dd6716e2191896c0577608081414509e41ee07c Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Mon, 14 Sep 2015 15:01:48 +0200
Subject: [PATCH] Detect incorrect cycle counting

When threads are not pinned to cores, the wallcycle counting can get
messed up on machines where the cycle counters are not synchronized
between the cores. Since it is difficult to detect thread pinning on
all architectures, and since even without pinning the counting can be
correct on machines with synchronized counters, we instead try to
detect incorrect counts. We only detect negative cycle counts, but
that should catch nearly all cases. When we detect an invalid count,
we ignore the cycles counted and do not print the cycle accounting
table. Dynamic load balancing is never disabled, because a few
incorrect load measurements do not cause problems there.

Fixes #1821

Change-Id: I076ad685a043f1f0b913a9b089ebea43a62534f5
---
 src/gromacs/domdec/domdec.cpp  | 28 ++++++++++++++++++----
 src/gromacs/timing/wallcycle.c | 53 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 3986ed0562..1e39e9f86c 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5200,6 +5200,16 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 {
+    /* Note that the cycles value can be incorrect, either 0 or some
+     * extremely large value, when our thread migrated to another core
+     * with an unsynchronized cycle counter. If this happens less often
+     * than once per nstlist steps, this will not cause issues, since
+     * we later subtract the maximum value from the sum over nstlist steps.
+     * A zero count will slightly lower the total, but that is a small effect.
+     * Note that the main purpose of the subtraction of the maximum value
+     * is to avoid throwing off the load balancing when stalls occur due to,
+     * e.g., system activity or network congestion.
+     */
     dd->comm->cycl[ddCycl] += cycles;
     dd->comm->cycl_n[ddCycl]++;
     if (cycles > dd->comm->cycl_max[ddCycl])
     {
@@ -5465,7 +5475,7 @@ static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
     /* Return the relative performance loss on the total run time
      * due to the force calculation load imbalance.
      */
-    if (dd->comm->nload > 0)
+    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
     {
         return
             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
@@ -5491,7 +5501,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
     npp    = dd->nnodes;
     npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
     nnodes = npp + npme;
-    if (dd->nnodes > 1)
+    if (dd->nnodes > 1 && comm->load_sum > 0)
     {
         imbal  = comm->load_max*npp/comm->load_sum - 1;
         lossf  = dd_force_imb_perf_loss(dd);
@@ -5520,10 +5530,10 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
         fprintf(fplog, "%s", buf);
         fprintf(stderr, "%s", buf);
     }
-    if (npme > 0)
+    if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
     {
         pme_f_ratio = comm->load_pme/comm->load_mdf;
-        lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
+        lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
         if (lossp <= 0)
         {
             lossp *= (float)npme/(float)nnodes;
@@ -5587,7 +5597,15 @@ static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 
 static float dd_f_imbal(gmx_domdec_t *dd)
 {
-    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
+    if (dd->comm->load[0].sum > 0)
+    {
+        return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
+    }
+    else
+    {
+        /* Something is wrong in the cycle counting, report no load imbalance */
+        return 0.0f;
+    }
 }
 
 float dd_pme_f_ratio(gmx_domdec_t *dd)
diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c
index 4961ce58a3..f603bf0fe4 100644
--- a/src/gromacs/timing/wallcycle.c
+++ b/src/gromacs/timing/wallcycle.c
@@ -73,6 +73,8 @@ typedef struct
 typedef struct gmx_wallcycle
 {
     wallcc_t        *wcc;
+    /* did we detect one or more invalid cycle counts */
+    gmx_bool         haveInvalidCount;
     /* variables for testing/debugging */
     gmx_bool         wc_barrier;
     wallcc_t        *wcc_all;
@@ -143,6 +145,7 @@ gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused
 
     snew(wc, 1);
 
+    wc->haveInvalidCount = FALSE;
     wc->wc_barrier       = FALSE;
     wc->wcc_all          = NULL;
     wc->wc_depth         = 0;
@@ -321,9 +324,25 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     debug_stop_check(wc, ewc);
 #endif
 
-    cycle = gmx_cycles_read();
-    last  = cycle - wc->wcc[ewc].start;
-    wc->wcc[ewc].c += last;
+    /* When processes or threads migrate between cores, the cycle counting
+     * can get messed up if the cycle counters on different cores are not
+     * synchronized. When this happens we expect both large negative and
+     * positive cycle differences. We can detect negative cycle differences.
+     * Detecting too-large positive counts is difficult, since counts can be
+     * large, especially for ewcRUN. If we detect a negative count,
+     * we will not print the cycle accounting table.
+     */
+    cycle = gmx_cycles_read();
+    if (cycle >= wc->wcc[ewc].start)
+    {
+        last = cycle - wc->wcc[ewc].start;
+    }
+    else
+    {
+        last                 = 0;
+        wc->haveInvalidCount = TRUE;
+    }
+    wc->wcc[ewc].c += last;
     wc->wcc[ewc].n++;
     if (wc->wcc_all)
     {
@@ -361,6 +380,8 @@ void wallcycle_reset_all(gmx_wallcycle_t wc)
         wc->wcc[i].n = 0;
         wc->wcc[i].c = 0;
     }
+    wc->haveInvalidCount = FALSE;
+
     if (wc->wcc_all)
     {
         for (i = 0; i < ewcNR*ewcNR; i++)
@@ -393,9 +414,15 @@ static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 {
     if (wcc[ewc_sub].n > 0)
     {
-        assert(wcc[ewc_main].c >= wcc[ewc_sub].c);
-
-        wcc[ewc_main].c -= wcc[ewc_sub].c;
+        if (wcc[ewc_main].c >= wcc[ewc_sub].c)
+        {
+            wcc[ewc_main].c -= wcc[ewc_sub].c;
+        }
+        else
+        {
+            /* Something is wrong with the cycle counting */
+            wcc[ewc_main].c = 0;
+        }
     }
 }
 
@@ -403,7 +430,7 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 {
     wallcc_t *wcc;
     double    cycles[ewcNR+ewcsNR];
-    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
+    double    cycles_n[ewcNR+ewcsNR+1], buf[ewcNR+ewcsNR+1], *cyc_all, *buf_all;
     int       i, j;
     int       nsum;
 
@@ -491,12 +518,14 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 #ifdef GMX_MPI
     if (cr->nnodes > 1)
     {
-        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
+        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
+        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
                       cr->mpi_comm_mysim);
         for (i = 0; i < ewcNR; i++)
         {
             wcc[i].n = (int)(buf[i] + 0.5);
         }
+        wc->haveInvalidCount = (buf[nsum] > 0);
 #ifdef GMX_CYCLE_SUBCOUNTERS
         for (i = 0; i < ewcsNR; i++)
         {
@@ -692,6 +721,14 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
         return;
     }
 
+    if (wc->haveInvalidCount)
+    {
+        md_print_warn(NULL, fplog, "%s\n",
+                      "NOTE: Detected invalid cycle counts, probably because threads moved "
+                      "between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
+        return;
+    }
+
     /* Conversion factor from cycles to seconds */
     c2t     = realtime/tot;
     c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
-- 
2.11.4.GIT
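
Reviewer note: the comment added in dd_cycles_add refers to subtracting the
maximum per-step value from the sum over nstlist steps. Below is a minimal
standalone sketch of that idea; all names (dd_load_t, load_add, load_signal)
and the sample numbers are illustrative, not the GROMACS API.

#include <stdio.h>

typedef struct
{
    float cycl;     /* sum of per-step cycle counts */
    int   cycl_n;   /* number of steps counted */
    float cycl_max; /* largest single-step count seen */
} dd_load_t;

static void load_add(dd_load_t *load, float cycles)
{
    load->cycl += cycles;
    load->cycl_n++;
    if (cycles > load->cycl_max)
    {
        load->cycl_max = cycles;
    }
}

/* Average step load with the single largest step removed */
static float load_signal(const dd_load_t *load)
{
    if (load->cycl_n < 2)
    {
        return 0.0f; /* not enough samples to discard an outlier */
    }
    return (load->cycl - load->cycl_max)/(float)(load->cycl_n - 1);
}

int main(void)
{
    dd_load_t load = { 0.0f, 0, 0.0f };
    /* Nine normal steps plus one outlier, e.g. an OS stall or a bogus count */
    float     steps[10] = { 10, 11, 9, 10, 10, 12, 9, 11, 10, 500 };
    int       i;

    for (i = 0; i < 10; i++)
    {
        load_add(&load, steps[i]);
    }
    printf("robust per-step load: %.1f cycles\n", load_signal(&load));
    return 0;
}

Discarding one sample per nstlist-step window is why a single zero or huge
count from a core migration cannot throw off the load balancing.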
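The detection added in wallcycle_stop can be tried in isolation. Here is a
minimal sketch assuming x86 and GCC/Clang, with __rdtsc() standing in for
gmx_cycles_read(); the cc_t/cc_start/cc_stop names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h> /* __rdtsc() */

typedef struct
{
    uint64_t start; /* counter value at interval start */
    uint64_t c;     /* accumulated cycles */
    int      n;     /* number of intervals counted */
} cc_t;

static int haveInvalidCount = 0; /* set when a "negative" difference is seen */

static void cc_start(cc_t *cc)
{
    cc->start = __rdtsc();
}

static void cc_stop(cc_t *cc)
{
    uint64_t cycle = __rdtsc();
    uint64_t last;

    if (cycle >= cc->start)
    {
        last = cycle - cc->start;
    }
    else
    {
        /* The thread probably migrated to a core with an unsynchronized
         * counter; discard this interval instead of corrupting the sum. */
        last             = 0;
        haveInvalidCount = 1;
    }
    cc->c += last;
    cc->n++;
}

int main(void)
{
    cc_t cc = { 0, 0, 0 };

    cc_start(&cc);
    /* ... timed work would go here ... */
    cc_stop(&cc);

    if (haveInvalidCount)
    {
        printf("NOTE: invalid cycle count detected, skipping accounting\n");
    }
    else
    {
        printf("%llu cycles over %d interval(s)\n",
               (unsigned long long)cc.c, cc.n);
    }
    return 0;
}

With unsigned counters there is no literal negative value; a stop value below
the matching start value is the detectable symptom, which is exactly what the
patch checks for.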
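The flag propagation in wallcycle_sum piggybacks on the existing MPI_MAX
reduction by appending one extra double, so a flag of 1 on any rank becomes 1
on all ranks. A standalone sketch of that trick, with an illustrative counter
count (build with mpicc, run with mpirun):

#include <mpi.h>
#include <stdio.h>

#define NCOUNTERS 4

int main(int argc, char **argv)
{
    double send[NCOUNTERS + 1]; /* counters plus one flag slot */
    double recv[NCOUNTERS + 1];
    int    rank, i, haveInvalidCount;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    for (i = 0; i < NCOUNTERS; i++)
    {
        send[i] = 100.0*rank + i; /* stand-in for per-rank call counts */
    }
    /* In this example only rank 1 has seen an invalid count */
    haveInvalidCount = (rank == 1);
    send[NCOUNTERS]  = (haveInvalidCount ? 1 : 0);

    /* One reduction delivers both the max counts and the global flag */
    MPI_Allreduce(send, recv, NCOUNTERS + 1, MPI_DOUBLE, MPI_MAX,
                  MPI_COMM_WORLD);

    haveInvalidCount = (recv[NCOUNTERS] > 0);
    if (rank == 0)
    {
        printf("global invalid-count flag: %d\n", haveInvalidCount);
    }

    MPI_Finalize();
    return 0;
}

Under MPI_MAX a 0/1 flag behaves like a logical OR across ranks, so no second
reduction is needed; this is also why the receive buffer must hold nsum + 1
elements, which the declaration change in wallcycle_sum accounts for.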