From a4f4cc46c93c86c6e64f381dc8cd0c828014ba31 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Tue, 9 Oct 2012 17:08:42 +0200
Subject: [PATCH] updated several things related to OpenMP thread count

Removed a gmx_fatal preventing oversubscribing hardware with OpenMP threads.
Proper checks for avoiding partial HT and inconvenient thread count with tMPI.
Automatic switch to pure OpenMP parallelization with <= 4 threads
or <= 12 on Nehalem/Westmere or <= 16 on >=Sandy Bridge.

Change-Id: I5f0fa7c8e5a23cb66833ec9facdbc6d197f82836
---
 src/kernel/mdrun.c  | 11 +++++--
 src/kernel/runner.c | 93 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c
index 316b485143..dc49b99617 100644
--- a/src/kernel/mdrun.c
+++ b/src/kernel/mdrun.c
@@ -174,6 +174,13 @@ int main(int argc,char *argv[])
     "Note that using combined MPI+OpenMP parallelization is almost always",
     "slower than single parallelization, except at the scaling limit, where",
     "especially OpenMP parallelization of PME reduces the communication cost.",
+    "OpenMP-only parallelization is much faster than MPI-only parallelization",
+    "on a single CPU(-die). Since we currently don't have proper hardware",
+    "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
+    "automatically use OpenMP-only parallelization when you use up to 4",
+    "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
+    "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
+    "parallelization is used (except with GPUs, see below).",
     "[PAR]",
     "To quickly test the performance of the new Verlet cut-off scheme",
     "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
@@ -522,9 +529,9 @@ int main(int argc,char *argv[])
     { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
       "Number of thread-MPI threads to start (0 is guess)" },
     { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
-      "Number of OpenMP threads to start (0 is guess)" },
+      "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
     { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
-      "Number of OpenMP threads to start (0 is -ntomp)" },
+      "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
     { "-pin",     FALSE, etBOOL, {&hw_opt.bThreadPinning},
       "Pin OpenMP threads to cores" },
     { "-pinht",   FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
diff --git a/src/kernel/runner.c b/src/kernel/runner.c
index 7599238dc5..b7549d9a44 100644
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -275,9 +275,10 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 }
 
 
-static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
-                                            int nthreads_tot,
-                                            int ngpu)
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t *hw_opt,
+                                        int nthreads_tot,
+                                        int ngpu)
 {
     int nthreads_tmpi;
 
@@ -296,18 +297,47 @@ static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
     }
     else if (hw_opt->nthreads_omp > 0)
     {
-        if (hw_opt->nthreads_omp > nthreads_tot)
-        {
-            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
-        }
-        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+        /* Here we could oversubscribe, when we do, we issue a warning later */
+        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
     }
     else
     {
         /* TODO choose nthreads_omp based on hardware topology
            when we have a hardware topology detection library */
-        /* Don't use OpenMP parallelization */
-        nthreads_tmpi = nthreads_tot;
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_SandyBridge = 16;
+        const int first_model_Nehalem     = 0x1A; /* lowest CPUID model treated as Nehalem */
+        const int first_model_SandyBridge = 0x2A; /* lowest CPUID model treated as Sandy Bridge */
+        gmx_bool bIntel_Family6;
+
+        bIntel_Family6 =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            (bIntel_Family6 &&
+             ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+              (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+        {
+            /* Use pure OpenMP parallelization: CPU model (not thread limit) gates the 12/16 cut-offs */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
     }
 
     return nthreads_tmpi;
@@ -327,7 +357,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
                             const t_commrec *cr,
                             FILE *fplog)
 {
-    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
     int min_atoms_per_mpi_thread;
     char *env;
     char sbuf[STRLEN];
@@ -339,6 +369,8 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
         return hw_opt->nthreads_tmpi;
     }
 
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
     /* How many total (#tMPI*#OpenMP) threads can we start? */ 
     if (hw_opt->nthreads_tot > 0)
     {
@@ -346,7 +378,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
     }
     else
     {
-        nthreads_tot_max = tMPI_Thread_get_hw_number();
+        nthreads_tot_max = nthreads_hw;
     }
 
     bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
@@ -360,7 +392,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
     }
 
     nthreads_tmpi =
-        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
 
     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
     {
@@ -398,19 +430,34 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
            threads (too few atoms per thread) */
         nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 
-        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
         {
-            /* TODO replace this once we have proper HT detection
-             * Use only multiples of 4 above 8 threads
-             * or with an 8-core processor
-             * (to avoid 6 threads on 8 core processors with 4 real cores).
-             */
-            nthreads_new = (nthreads_new/4)*4;
+            nthreads_new = nthreads_hw/2;
         }
-        else if (nthreads_new > 4)
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
         {
-            /* Avoid 5 or 7 threads */
-            nthreads_new = (nthreads_new/2)*2;
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
         }
 
         nthreads_tmpi = nthreads_new;
-- 
2.11.4.GIT