From 18e384d3e37503add2d7fa01f7642706e160d7fd Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Mon, 16 May 2016 22:33:27 +0200
Subject: [PATCH] Remove OpenMP overhead at high parallelization

Commit 6d98622d introduced OpenMP parallelization for the for-loops
that clear or increment rvecs. For small numbers of atoms per MPI rank
this can increase the cost of the loop by up to a factor of 10.
This change disables OpenMP parallelization at low atom count.

Change-Id: I0006526568bb387f91e0a373f7ef203b3809f2e7
---
 src/gromacs/ewald/pme-pp.cpp         | 25 +++++++++++++-----
 src/gromacs/mdlib/gmx_omp_nthreads.h | 36 ++++++++++++++++++++++++-
 src/gromacs/mdlib/sim_util.cpp       | 51 +++++++++++++++++++++---------------
 3 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/src/gromacs/ewald/pme-pp.cpp b/src/gromacs/ewald/pme-pp.cpp
index defb287392..d7642deda6 100644
--- a/src/gromacs/ewald/pme-pp.cpp
+++ b/src/gromacs/ewald/pme-pp.cpp
@@ -773,14 +773,27 @@ void gmx_pme_receive_f(t_commrec *cr,
              MPI_STATUS_IGNORE);
 #endif
 
-    // cppcheck-suppress unreadVariable
-    int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel for num_threads(nt) schedule(static)
-    for (int i = 0; i < natoms; i++)
+    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);
+
+    /* Note that we would like to avoid this conditional by putting it
+     * into the omp pragma instead, but then we still take the full
+     * omp parallel for overhead (at least with gcc5).
+     */
+    if (nt == 1)
     {
-        rvec_inc(f[i], cr->dd->pme_recv_f_buf[i]);
+        for (int i = 0; i < natoms; i++)
+        {
+            rvec_inc(f[i], cr->dd->pme_recv_f_buf[i]);
+        }
+    }
+    else
+    {
+#pragma omp parallel for num_threads(nt) schedule(static)
+        for (int i = 0; i < natoms; i++)
+        {
+            rvec_inc(f[i], cr->dd->pme_recv_f_buf[i]);
+        }
     }
-
     receive_virial_energy(cr, vir_q, energy_q, vir_lj, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
 }
 
diff --git a/src/gromacs/mdlib/gmx_omp_nthreads.h b/src/gromacs/mdlib/gmx_omp_nthreads.h
index 5f769da891..92ac2ceb54 100644
--- a/src/gromacs/mdlib/gmx_omp_nthreads.h
+++ b/src/gromacs/mdlib/gmx_omp_nthreads.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -69,6 +69,40 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
  * Returns the number of threads to be used in the given module \p mod. */
 int gmx_omp_nthreads_get(int mod);
 
+/*! \brief
+ * Returns the number of threads to be used in the given module \p mod for simple rvec operations.
+ *
+ * When the potentially parallel task only consists of a loop of clear_rvec
+ * or rvec_inc for nrvec elements, the OpenMP overhead might be higher than
+ * the reduction in computational cost due to parallelization. This routine
+ * returns 1 when the overhead is expected to be higher than the gain.
+ */
+static int gmx_omp_nthreads_get_simple_rvec_task(int mod, int nrvec)
+{
+    /* There can be a relatively large overhead to an OpenMP parallel for loop.
+     * This overhead increases, slowly, with the number of threads used.
+     * The computational gain goes as 1/#threads. The two effects combined
+     * lead to a cross-over point for a (non-)parallel loop at a loop count
+     * that is not strongly dependent on the thread count.
+     * Note that a (non-)parallel loop can have a benefit later in the code
+     * due to generating more cache hits, depending on how the next task
+     * that accesses the same data is (not) parallelized over threads.
+     *
+     * A value of 2000 is the switch-over point for Haswell without
+     * hyper-threading. With hyper-threading it is about a factor 1.5 higher.
+     */
+    const int nrvec_omp = 2000;
+
+    if (nrvec < nrvec_omp)
+    {
+        return 1;
+    }
+    else
+    {
+        return gmx_omp_nthreads_get(mod);
+    }
+}
+
 /*! \brief Sets the number of threads to be used in module.
  *
  * Intended for use in testing. */
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index e6292e2570..d48e1594dd 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -738,13 +738,28 @@ gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
     return nbv != NULL && nbv->bUseGPU;
 }
 
-static gmx_inline void clear_rvecs_omp_nowait(int n, rvec v[])
+static gmx_inline void clear_rvecs_omp(int n, rvec v[])
 {
-    int i;
-#pragma omp for nowait
-    for (i = 0; i < n; i++)
+    int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
+
+    /* Note that we would like to avoid this conditional by putting it
+     * into the omp pragma instead, but then we still take the full
+     * omp parallel for overhead (at least with gcc5).
+     */
+    if (nth == 1)
     {
-        clear_rvec(v[i]);
+        for (int i = 0; i < n; i++)
+        {
+            clear_rvec(v[i]);
+        }
+    }
+    else
+    {
+#pragma omp parallel for num_threads(nth) schedule(static)
+        for (int i = 0; i < n; i++)
+        {
+            clear_rvec(v[i]);
+        }
     }
 }
 
@@ -1185,28 +1200,22 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             }
         }
 
-        // cppcheck-suppress unreadVariable
-        int gmx_unused nth = gmx_omp_nthreads_get(emntDefault);
-#pragma omp parallel num_threads(nth)
+        if (fr->bF_NoVirSum)
         {
-            if (fr->bF_NoVirSum)
+            if (flags & GMX_FORCE_VIRIAL)
             {
-                if (flags & GMX_FORCE_VIRIAL)
+                if (fr->bDomDec)
                 {
-
-                    if (fr->bDomDec)
-                    {
-                        clear_rvecs_omp_nowait(fr->f_novirsum_n, fr->f_novirsum);
-                    }
-                    else
-                    {
-                        clear_rvecs_omp_nowait(homenr, fr->f_novirsum+start);
-                    }
+                    clear_rvecs_omp(fr->f_novirsum_n, fr->f_novirsum);
+                }
+                else
+                {
+                    clear_rvecs_omp(homenr, fr->f_novirsum+start);
                 }
             }
-            /* Clear the short- and long-range forces */
-            clear_rvecs_omp_nowait(fr->natoms_force_constr, f);
         }
+        /* Clear the short- and long-range forces */
+        clear_rvecs_omp(fr->natoms_force_constr, f);
 
         clear_rvec(fr->vir_diag_posres);
     }
-- 
2.11.4.GIT
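
A standalone sketch of the pattern, for readers who want to try it outside the
GROMACS tree: pick the thread count from a size threshold, run the loop
serially below the threshold, and only pay for an OpenMP parallel for above
it. The helper name simple_rvec_task_nthreads, the rvec_t typedef and the
main() driver below are illustrative only and are not part of this patch or of
the GROMACS API; the 2000-element crossover is simply the value quoted in the
patch comment. Compile with g++ -fopenmp.

#include <cstdio>
#include <omp.h>

typedef float rvec_t[3];   /* stand-in for the GROMACS rvec type */

/* Return 1 (serial) below the crossover size, otherwise the module's thread count. */
static int simple_rvec_task_nthreads(int module_nthreads, int nrvec)
{
    const int nrvec_omp = 2000;   /* crossover quoted in the patch comment */

    return (nrvec < nrvec_omp) ? 1 : module_nthreads;
}

static void clear_rvecs(int n, rvec_t v[], int module_nthreads)
{
    int nth = simple_rvec_task_nthreads(module_nthreads, n);

    /* Keep the serial path free of any OpenMP construct, as the patch does:
     * even a one-thread "omp parallel for" still carries the region overhead. */
    if (nth == 1)
    {
        for (int i = 0; i < n; i++)
        {
            v[i][0] = v[i][1] = v[i][2] = 0;
        }
    }
    else
    {
#pragma omp parallel for num_threads(nth) schedule(static)
        for (int i = 0; i < n; i++)
        {
            v[i][0] = v[i][1] = v[i][2] = 0;
        }
    }
}

int main()
{
    const int nsmall = 500;       /* below the threshold: stays serial */
    const int nlarge = 100000;    /* above the threshold: uses OpenMP  */
    rvec_t   *fsmall = new rvec_t[nsmall];
    rvec_t   *flarge = new rvec_t[nlarge];

    clear_rvecs(nsmall, fsmall, omp_get_max_threads());
    clear_rvecs(nlarge, flarge, omp_get_max_threads());

    std::printf("cleared %d + %d rvecs\n", nsmall, nlarge);

    delete[] fsmall;
    delete[] flarge;
    return 0;
}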