From a4f4cc46c93c86c6e64f381dc8cd0c828014ba31 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Tue, 9 Oct 2012 17:08:42 +0200 Subject: [PATCH] updated several things related with OpenMP thread count Removed a gmx_fatal preventing oversubscribing hardware with OpenMP threads. Proper checks for avoiding partial HT and inconvenient thread count with tMPI. Automatic switch to pure OpenMP parallelization with <= 4 threads or <= 12 on Nehalem/Westmere or <= 16 on >=Sandy Bridge. Change-Id: I5f0fa7c8e5a23cb66833ec9facdbc6d197f82836 --- src/kernel/mdrun.c | 11 +++++-- src/kernel/runner.c | 93 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 79 insertions(+), 25 deletions(-) diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c index 316b485143..dc49b99617 100644 --- a/src/kernel/mdrun.c +++ b/src/kernel/mdrun.c @@ -174,6 +174,13 @@ int main(int argc,char *argv[]) "Note that using combined MPI+OpenMP parallelization is almost always", "slower than single parallelization, except at the scaling limit, where", "especially OpenMP parallelization of PME reduces the communication cost.", + "OpenMP-only parallelization is much faster than MPI-only parallelization", + "on a single CPU(-die). Since we currently don't have proper hardware", + "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", + "automatically use OpenMP-only parallelization when you use up to 4", + "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", + "threads with Intel Sandy Bridge or newer CPUs. 
Otherwise MPI-only", + "parallelization is used (except with GPUs, see below).", "[PAR]", "To quickly test the performance of the new Verlet cut-off scheme", "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", @@ -522,9 +529,9 @@ int main(int argc,char *argv[]) { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, "Number of thread-MPI threads to start (0 is guess)" }, { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, - "Number of OpenMP threads to start (0 is guess)" }, + "Number of OpenMP threads per MPI process/thread to start (0 is guess)" }, { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, - "Number of OpenMP threads to start (0 is -ntomp)" }, + "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" }, { "-pin", FALSE, etBOOL, {&hw_opt.bThreadPinning}, "Pin OpenMP threads to cores" }, { "-pinht", FALSE, etBOOL, {&hw_opt.bPinHyperthreading}, diff --git a/src/kernel/runner.c b/src/kernel/runner.c index 7599238dc5..b7549d9a44 100644 --- a/src/kernel/runner.c +++ b/src/kernel/runner.c @@ -275,9 +275,10 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, } -static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt, - int nthreads_tot, - int ngpu) +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo, + const gmx_hw_opt_t *hw_opt, + int nthreads_tot, + int ngpu) { int nthreads_tmpi; @@ -296,18 +297,47 @@ static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt, } else if (hw_opt->nthreads_omp > 0) { - if (hw_opt->nthreads_omp > nthreads_tot) - { - gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot); - } - nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp; + /* Here we could oversubscribe, when we do, we issue a warning later */ + nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp); } else { /* TODO choose nthreads_omp based on hardware topology when we have a hardware topology 
detection library */ - /* Don't use OpenMP parallelization */ - nthreads_tmpi = nthreads_tot; + /* In general, when running up to 4 threads, OpenMP should be faster. + * Note: on AMD Bulldozer we should avoid running OpenMP over two dies. + * On Intel>=Nehalem running OpenMP on a single CPU is always faster, + * even on two CPUs it's usually faster (but with many OpenMP threads + * it could be faster not to use HT, currently we always use HT). + * On Nehalem/Westmere we want to avoid running 16 threads over + * two CPUs with HT, so we need a limit<16; thus we use 12. + * A reasonable limit for Intel Sandy and Ivy bridge, + * not knowing the topology, is 16 threads. + */ + const int nthreads_omp_always_faster = 4; + const int nthreads_omp_always_faster_Nehalem = 12; + const int nthreads_omp_always_faster_SandyBridge = 16; + const int first_model_Nehalem = 0x1A; + const int first_model_SandyBridge = 0x2A; + gmx_bool bIntel_Family6; + + bIntel_Family6 = + (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL && + gmx_cpuid_family(hwinfo->cpuid_info) == 6); + + if (nthreads_tot <= nthreads_omp_always_faster || + (bIntel_Family6 && + ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) || + (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge)))) + { + /* Use pure OpenMP parallelization */ + nthreads_tmpi = 1; + } + else + { + /* Don't use OpenMP parallelization */ + nthreads_tmpi = nthreads_tot; + } } return nthreads_tmpi; @@ -327,7 +357,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, const t_commrec *cr, FILE *fplog) { - int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu; + int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu; int min_atoms_per_mpi_thread; char *env; char sbuf[STRLEN]; @@ -339,6 +369,8 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, return 
hw_opt->nthreads_tmpi; } + nthreads_hw = hwinfo->nthreads_hw_avail; + /* How many total (#tMPI*#OpenMP) threads can we start? */ if (hw_opt->nthreads_tot > 0) { @@ -346,7 +378,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, } else { - nthreads_tot_max = tMPI_Thread_get_hw_number(); + nthreads_tot_max = nthreads_hw; } bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU); @@ -360,7 +392,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, } nthreads_tmpi = - get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu); + get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu); if (inputrec->eI == eiNM || EI_TPI(inputrec->eI)) { @@ -398,19 +430,34 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, threads (too few atoms per thread) */ nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread); - if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4)) + /* Avoid partial use of Hyper-Threading */ + if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && + nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw) { - /* TODO replace this once we have proper HT detection - * Use only multiples of 4 above 8 threads - * or with an 8-core processor - * (to avoid 6 threads on 8 core processors with 4 real cores). - */ - nthreads_new = (nthreads_new/4)*4; + nthreads_new = nthreads_hw/2; } - else if (nthreads_new > 4) + + /* Avoid large prime numbers in the thread count */ + if (nthreads_new >= 6) { - /* Avoid 5 or 7 threads */ - nthreads_new = (nthreads_new/2)*2; + /* Use only 6,8,10 with additional factors of 2 */ + int fac; + + fac = 2; + while (3*fac*2 <= nthreads_new) + { + fac *= 2; + } + + nthreads_new = (nthreads_new/fac)*fac; + } + else + { + /* Avoid 5 */ + if (nthreads_new == 5) + { + nthreads_new = 4; + } } nthreads_tmpi = nthreads_new; -- 2.11.4.GIT