From dd9859930822a97cb9b33ba116821e6940329675 Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Fri, 4 Aug 2017 13:52:33 +0200
Subject: [PATCH] Decouple task assignment from task execution

Code that needs to run on a GPU does not also need to know about the
code and data structures that underpin task assignment.  The outcome
of task assignment is the information about which GPU to use, and it
is simple and effective to give just that result to the code that
needs it.

Simplifies t_forcerec.

Added more const correctness for gmx_device_info_t pointers.

Change-Id: I094c19e08be73af998bd287e43d5c2b6e5969a60
---
 src/gromacs/domdec/domdec.cpp                      | 17 +++---
 src/gromacs/domdec/domdec.h                        |  8 +--
 src/gromacs/gpu_utils/gpu_utils.cu                 | 69 +++++++++-------------
 src/gromacs/gpu_utils/gpu_utils.h                  | 41 +++++++------
 src/gromacs/gpu_utils/gpu_utils_ocl.cpp            | 31 +++++-----
 src/gromacs/gpu_utils/ocl_compiler.h               |  3 +-
 src/gromacs/mdlib/force.h                          |  4 +-
 src/gromacs/mdlib/forcerec.cpp                     | 43 ++++++--------
 src/gromacs/mdlib/forcerec.h                       |  3 +
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu         |  8 +--
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu       | 10 +---
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h    |  2 +-
 src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h            |  7 +--
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp          |  2 +-
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp        |  8 +--
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h      |  4 +-
 src/gromacs/mdtypes/forcerec.h                     |  4 --
 src/programs/mdrun/runner.cpp                      | 18 ++++--
 18 files changed, 129 insertions(+), 153 deletions(-)

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 81a4289d74..1a5693187a 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5388,28 +5388,24 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 }
 #endif
 
-void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
-                                   const gmx_hw_info_t gmx_unused *hwinfo,
-                                   const gmx_hw_opt_t  gmx_unused &hw_opt)
+void dd_setup_dlb_resource_sharing(t_commrec            *cr,
+                                   int                   gpu_id)
 {
 #if GMX_MPI
     int           physicalnode_id_hash;
-    int           gpu_id;
     gmx_domdec_t *dd;
     MPI_Comm      mpi_comm_pp_physicalnode;
 
-    if (!(cr->duty & DUTY_PP) || hw_opt.gpu_opt.n_dev_use == 0)
+    if (!(cr->duty & DUTY_PP) || gpu_id < 0)
     {
-        /* Only PP nodes (currently) use GPUs.
-         * If we don't have GPUs, there are no resources to share.
+        /* Only ranks with short-ranged tasks (currently) use GPUs.
+         * If we don't have GPUs assigned, there are no resources to share.
          */
         return;
     }
 
     physicalnode_id_hash = gmx_physicalnode_id_hash();
 
-    gpu_id = get_gpu_device_id(hwinfo->gpu_info, &hw_opt.gpu_opt, cr->rank_pp_intranode);
-
     dd = cr->dd;
 
     if (debug)
@@ -5440,6 +5436,9 @@ void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
     {
         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
     }
+#else
+    GMX_UNUSED_VALUE(cr);
+    GMX_UNUSED_VALUE(gpu_id);
 #endif
 }
 
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 814fd614b5..d23c444a13 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -61,7 +61,6 @@
 #include <stdio.h>
 
 #include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/hardware/hw_info.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdlib/vsite.h"
 #include "gromacs/mdtypes/forcerec.h"
@@ -193,16 +192,15 @@ void dd_dlb_lock(struct gmx_domdec_t *dd);
 /*! \brief Clear a lock such that with DLB=auto DLB may get turned on later */
 void dd_dlb_unlock(struct gmx_domdec_t *dd);
 
-/*! \brief Set up communication for averaging GPU wait times over ranks
+/*! \brief Set up communication for averaging GPU wait times over domains
  *
  * When domains (PP MPI ranks) share a GPU, the individual GPU wait times
  * are meaningless, as it depends on the order in which tasks on the same
  * GPU finish. Therefore there wait times need to be averaged over the ranks
  * sharing the same GPU. This function sets up the communication for that.
  */
-void dd_setup_dlb_resource_sharing(struct t_commrec           *cr,
-                                   const gmx_hw_info_t        *hwinfo,
-                                   const gmx_hw_opt_t         &hw_opt);
+void dd_setup_dlb_resource_sharing(t_commrec           *cr,
+                                   int                  gpu_id);
 
 /*! \brief Collects local rvec arrays \p lv to \p v on the master rank */
 void dd_collect_vec(struct gmx_domdec_t *dd,
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 2ad9ec26ba..4ae442a374 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -280,25 +280,22 @@ static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 
 /*! \brief Tries to set application clocks for the GPU with the given index.
  *
- * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
- * to handle the application clocks for. Application clocks are set to the
- * max supported value to increase performance if application clock permissions
- * allow this. For future GPU architectures a more sophisticated scheme might be
- * required.
+ * Application clocks are set to the max supported value to increase
+ * performance if application clock permissions allow this. For future
+ * GPU architectures a more sophisticated scheme might be required.
  *
  * \todo Refactor this into a detection phase and a work phase. Also
  * refactor to remove compile-time dependence on logging header.
  *
  * \param     mdlog         log file to write to
- * \param[in] gpuid         index of the GPU to set application clocks for
- * \param[in] gpu_info      GPU info of all detected devices in the system.
+ * \param[in] cuda_dev      GPU device info for the GPU in use
  * \returns                 true if no error occurs during application clocks handling.
  */
 static gmx_bool init_gpu_application_clocks(
-        const gmx::MDLogger &mdlog, int gmx_unused gpuid,
-        const gmx_gpu_info_t gmx_unused *gpu_info)
+        const gmx::MDLogger &mdlog,
+        gmx_device_info_t   *cuda_dev)
 {
-    const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
+    const cudaDeviceProp *prop                        = &cuda_dev->prop;
     int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
     gmx_bool              bGpuCanUseApplicationClocks =
         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
@@ -344,8 +341,6 @@ static gmx_bool init_gpu_application_clocks(
         return false;
     }
 
-    gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
-
     if (!addNVMLDeviceId(cuda_dev))
     {
         return false;
@@ -453,53 +448,36 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused
 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 }
 
-void init_gpu(const gmx::MDLogger &mdlog, int rank, int mygpu,
-              const struct gmx_gpu_info_t *gpu_info,
-              const struct gmx_gpu_opt_t *gpu_opt)
+void init_gpu(const gmx::MDLogger &mdlog, int rank,
+              gmx_device_info_t *deviceInfo)
 {
     cudaError_t stat;
     char        sbuf[STRLEN];
-    int         gpuid;
-
-    assert(gpu_info);
-    assert(gpu_opt);
-
-    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
-    {
-        snprintf(sbuf, STRLEN, "On rank %d trying to initialize an non-existent GPU: "
-                 "there are %d selected GPU(s), but #%d was requested.",
-                 rank, gpu_opt->n_dev_use, mygpu);
-        gmx_incons(sbuf);
-    }
 
-    gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
+    assert(deviceInfo);
 
-    stat = cudaSetDevice(gpuid);
+    stat = cudaSetDevice(deviceInfo->id);
     if (stat != cudaSuccess)
     {
         snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
-                 rank, mygpu);
+                 rank, deviceInfo->id);
         CU_RET_ERR(stat, sbuf);
     }
 
     if (debug)
     {
-        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
+        fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
     }
 
     //Ignoring return value as NVML errors should be treated not critical.
-    init_gpu_application_clocks(mdlog, gpuid, gpu_info);
+    init_gpu_application_clocks(mdlog, deviceInfo);
 }
 
-gmx_bool free_cuda_gpu(
-        int gmx_unused mygpu, char *result_str,
-        const gmx_gpu_info_t gmx_unused *gpu_info,
-        const gmx_gpu_opt_t gmx_unused *gpu_opt
-        )
+gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
+                       char                    *result_str)
 {
     cudaError_t  stat;
     gmx_bool     reset_gpu_application_clocks_status = true;
-    int          gpuid;
 
     assert(result_str);
 
@@ -511,10 +489,9 @@ gmx_bool free_cuda_gpu(
         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
     }
 
-    gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
-    if (gpuid != -1)
+    if (deviceInfo != nullptr)
     {
-        reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
+        reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo);
     }
 
     stat = cudaDeviceReset();
@@ -522,6 +499,16 @@ gmx_bool free_cuda_gpu(
     return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
 }
 
+gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
+                                 int                   deviceId)
+{
+    if (deviceId < 0 || deviceId >= gpu_info.n_dev)
+    {
+        gmx_incons("Invalid GPU deviceId requested");
+    }
+    return &gpu_info.gpu_dev[deviceId];
+}
+
 /*! \brief Returns true if the gpu characterized by the device properties is
  *  supported by the native gpu acceleration.
  *
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 0e9d04f666..2bb1e21f0c 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -50,6 +50,7 @@
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/utility/basedefinitions.h"
 
+struct gmx_device_info_t;
 struct gmx_gpu_info_t;
 struct gmx_gpu_opt_t;
 
@@ -101,16 +102,14 @@ const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &GPU_FUNC_ARGUME
 GPU_FUNC_QUALIFIER
 void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
 
-/*! \brief Initializes the GPU with the given index.
+/*! \brief Initializes the GPU described by \c deviceInfo.
  *
- * The varible \p mygpu is the index of the GPU to initialize in the
- * gpu_info.gpu_dev array.
+ * TODO Doxygen complains about these - probably a Doxygen bug, since
+ * the patterns here are the same as elsewhere in this header.
  *
- * \param      mdlog        log file to write to
- * \param[in]  rank         MPI rank of this process (for error output)
- * \param[in]  mygpu        index of the GPU to initialize
- * \param[in] gpu_info      GPU info of all detected devices in the system.
- * \param[in] gpu_opt       options for using the GPUs in gpu_info
+ *  param[in]    mdlog        log file to write to
+ *  param[in]    rank         MPI rank of this process (for error output)
+ * \param[inout] deviceInfo   device info of the GPU to initialize
  *
  * Issues a fatal error for any critical errors that occur during
  * initialization.
@@ -118,27 +117,33 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
 GPU_FUNC_QUALIFIER
 void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
               int GPU_FUNC_ARGUMENT(rank),
-              int GPU_FUNC_ARGUMENT(mygpu),
-              const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
-              const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
+              gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
  *
  * The context is explicitly destroyed and therefore all data uploaded to the GPU
  * is lost. This should only be called when none of this data is required anymore.
  *
- * \param[in]  mygpu        index of the GPU clean up for
+ * \param[in]  deviceInfo   device info of the GPU to clean up for
  * \param[out] result_str   the message related to the error that occurred
  *                          during the initialization (if there was any).
- * \param[in] gpu_info      GPU info of all detected devices in the system.
- * \param[in] gpu_opt       options for using the GPUs in gpu_info
+ *
  * \returns                 true if no error occurs during the freeing.
  */
 CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(int CUDA_FUNC_ARGUMENT(mygpu),
-                       char *CUDA_FUNC_ARGUMENT(result_str),
-                       const gmx_gpu_info_t *CUDA_FUNC_ARGUMENT(gpu_info),
-                       const gmx_gpu_opt_t *CUDA_FUNC_ARGUMENT(gpu_opt)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
+gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo),
+                       char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
+
+/*! \brief Return a pointer to the device info for \c deviceId
+ *
+ * \param[in] gpu_info      GPU info of all detected devices in the system.
+ * \param[in] deviceId      ID for the GPU device requested.
+ *
+ * \returns                 Pointer to the device info for \c deviceId.
+ */
+GPU_FUNC_QUALIFIER
+gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &GPU_FUNC_ARGUMENT(gpu_info),
+                                 int GPU_FUNC_ARGUMENT(deviceId)) GPU_FUNC_TERM_WITH_RETURN(NULL)
 
 /*! \brief Returns the device ID of the CUDA GPU currently in use.
  *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index e030186f78..efbb796879 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -393,29 +393,17 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
 
 //! This function is documented in the header file
 void init_gpu(const gmx::MDLogger               & /*mdlog*/,
-              int                               rank,
-              int                               mygpu,
-              const gmx_gpu_info_t             *gpu_info,
-              const gmx_gpu_opt_t              *gpu_opt
-              )
+              int                               /* rank */,
+              gmx_device_info_t                *deviceInfo)
 {
-    assert(gpu_opt);
-
-    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
-    {
-        char        sbuf[STRLEN];
-        sprintf(sbuf, "On rank %d trying to initialize an non-existent GPU: "
-                "there are %d selected GPU(s), but #%d was requested.",
-                rank, gpu_opt->n_dev_use, mygpu);
-        gmx_incons(sbuf);
-    }
+    assert(deviceInfo);
 
     // If the device is NVIDIA, for safety reasons we disable the JIT
     // caching as this is known to be broken at least until driver 364.19;
     // the cache does not always get regenerated when the source code changes,
     // e.g. if the path to the kernel sources remains the same
 
-    if (gpu_info->gpu_dev[mygpu].vendor_e == OCL_VENDOR_NVIDIA)
+    if (deviceInfo->vendor_e == OCL_VENDOR_NVIDIA)
     {
         // Ignore return values, failing to set the variable does not mean
         // that something will go wrong later.
@@ -440,6 +428,17 @@ int get_gpu_device_id(const gmx_gpu_info_t  &,
 }
 
 //! This function is documented in the header file
+gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
+                                 int                   deviceId)
+{
+    if (deviceId < 0 || deviceId >= gpu_info.n_dev)
+    {
+        gmx_incons("Invalid GPU deviceId requested");
+    }
+    return &gpu_info.gpu_dev[deviceId];
+}
+
+//! This function is documented in the header file
 char* get_ocl_gpu_device_name(const gmx_gpu_info_t *gpu_info,
                               const gmx_gpu_opt_t  *gpu_opt,
                               int                   idx)
diff --git a/src/gromacs/gpu_utils/ocl_compiler.h b/src/gromacs/gpu_utils/ocl_compiler.h
index b5db76eb4c..33e64941ce 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.h
+++ b/src/gromacs/gpu_utils/ocl_compiler.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,7 +47,6 @@
 #include <string>
 
 #include "gromacs/gpu_utils/oclutils.h"
-#include "gromacs/hardware/gpu_hw_info.h"
 
 namespace gmx
 {
diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h
index 7874e723ad..93cd302140 100644
--- a/src/gromacs/mdlib/force.h
+++ b/src/gromacs/mdlib/force.h
@@ -45,6 +45,7 @@
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/arrayref.h"
 
+struct gmx_device_info_t;
 struct gmx_edsam;
 struct gmx_gpu_info_t;
 struct gmx_groups_t;
@@ -214,7 +215,6 @@ void do_force_lowlevel(t_forcerec   *fr,
 
 void free_gpu_resources(const t_forcerec            *fr,
                         const t_commrec             *cr,
-                        const gmx_gpu_info_t        *gpu_info,
-                        const gmx_gpu_opt_t         *gpu_opt);
+                        const gmx_device_info_t     *deviceInfo);
 
 #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index c4ed97ebf7..dd37fe8bf0 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2024,6 +2024,10 @@ init_interaction_const(FILE                       *fp,
     *interaction_const = ic;
 }
 
+/* TODO deviceInfo should be logically const, but currently
+ * init_gpu modifies it to set up NVML support. This could
+ * happen during the detection phase, and deviceInfo could
+ * the become const. */
 static void init_nb_verlet(FILE                *fp,
                            const gmx::MDLogger &mdlog,
                            nonbonded_verlet_t **nb_verlet,
@@ -2031,7 +2035,8 @@ static void init_nb_verlet(FILE                *fp,
                            const t_inputrec    *ir,
                            const t_forcerec    *fr,
                            const t_commrec     *cr,
-                           const char          *nbpu_opt)
+                           const char          *nbpu_opt,
+                           gmx_device_info_t   *deviceInfo)
 {
     nonbonded_verlet_t *nbv;
     int                 i;
@@ -2044,17 +2049,14 @@ static void init_nb_verlet(FILE                *fp,
     snew(nbv, 1);
 
     nbv->emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
-    nbv->bUseGPU    = (fr->gpu_opt->n_dev_use > 0);
+    nbv->bUseGPU    = deviceInfo != nullptr;
+
     GMX_RELEASE_ASSERT(!(nbv->emulateGpu && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment");
 
     if (nbv->bUseGPU)
     {
-        /* This PP MPI rank uses the GPU that the GPU assignment
-         * prepared for it, which is the entry in gpu_opt->dev_use
-         * corresponding to the index of this PP MPI rank within the
-         * set of such ranks on this node. */
-        init_gpu(mdlog, cr->nodeid, cr->rank_pp_intranode,
-                 &fr->hwinfo->gpu_info, fr->gpu_opt);
+        /* Use the assigned GPU. */
+        init_gpu(mdlog, cr->nodeid, deviceInfo);
     }
 
     nbv->nbs             = nullptr;
@@ -2170,11 +2172,9 @@ static void init_nb_verlet(FILE                *fp,
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
         nbnxn_gpu_init(&nbv->gpu_nbv,
-                       &fr->hwinfo->gpu_info,
-                       fr->gpu_opt,
+                       deviceInfo,
                        fr->ic,
                        nbv->grp,
-                       cr->rank_pp_intranode,
                        cr->nodeid,
                        (nbv->ngrp > 1) && !bHybridGPURun);
 
@@ -2245,6 +2245,7 @@ void init_forcerec(FILE                *fp,
                    const char          *tabpfn,
                    const t_filenm      *tabbfnm,
                    const char          *nbpu_opt,
+                   gmx_device_info_t   *deviceInfo,
                    gmx_bool             bNoSolvOpt,
                    real                 print_force)
 {
@@ -2258,15 +2259,6 @@ void init_forcerec(FILE                *fp,
     gmx_bool       bFEP_NonBonded;
     int           *nm_ind, egp_flags;
 
-    if (fr->hwinfo == nullptr)
-    {
-        /* Detect hardware, gather information.
-         * In mdrun, hwinfo has already been set before calling init_forcerec.
-         * Here we ignore GPUs, as tools will not use them anyhow.
-         */
-        fr->hwinfo = gmx_detect_hardware(mdlog, cr, FALSE);
-    }
-
     /* By default we turn SIMD kernels on, but it might be turned off further down... */
     fr->use_simd_kernels = TRUE;
 
@@ -3136,7 +3128,7 @@ void init_forcerec(FILE                *fp,
             GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
         }
 
-        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
+        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt, deviceInfo);
     }
 
     if (ir->eDispCorr != edispcNO)
@@ -3175,10 +3167,9 @@ void pr_forcerec(FILE *fp, t_forcerec *fr)
  * in this run because the PME ranks have no knowledge of whether GPUs
  * are used or not, but all ranks need to enter the barrier below.
  */
-void free_gpu_resources(const t_forcerec     *fr,
-                        const t_commrec      *cr,
-                        const gmx_gpu_info_t *gpu_info,
-                        const gmx_gpu_opt_t  *gpu_opt)
+void free_gpu_resources(const t_forcerec        *fr,
+                        const t_commrec         *cr,
+                        const gmx_device_info_t *deviceInfo)
 {
     gmx_bool bIsPPrankUsingGPU;
     char     gpu_err_str[STRLEN];
@@ -3213,7 +3204,7 @@ void free_gpu_resources(const t_forcerec     *fr,
     if (bIsPPrankUsingGPU)
     {
         /* uninitialize GPU (by destroying the context) */
-        if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+        if (!free_cuda_gpu(deviceInfo, gpu_err_str))
         {
             gmx_warning("On rank %d failed to free GPU #%d: %s",
                         cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
diff --git a/src/gromacs/mdlib/forcerec.h b/src/gromacs/mdlib/forcerec.h
index 5eaafc4e49..c235a87015 100644
--- a/src/gromacs/mdlib/forcerec.h
+++ b/src/gromacs/mdlib/forcerec.h
@@ -44,6 +44,7 @@
 #include "gromacs/mdtypes/forcerec.h"
 #include "gromacs/timing/wallcycle.h"
 
+struct gmx_device_info_t;
 struct t_commrec;
 struct t_fcdata;
 struct t_filenm;
@@ -107,6 +108,7 @@ void init_interaction_const_tables(FILE                   *fp,
  * \param[in]  tabpfn      Table potential file for pair interactions
  * \param[in]  tabbfnm     Table potential files for bonded interactions
  * \param[in]  nbpu_opt    Nonbonded Processing Unit (GPU/CPU etc.)
+ * \param[in]  deviceInfo  Info about GPU device to use for short-ranged work
  * \param[in]  bNoSolvOpt  Do not use solvent optimization
  * \param[in]  print_force Print forces for atoms with force >= print_force
  */
@@ -122,6 +124,7 @@ void init_forcerec(FILE                   *fplog,
                    const char             *tabpfn,
                    const t_filenm         *tabbfnm,
                    const char             *nbpu_opt,
+                   gmx_device_info_t      *deviceInfo,
                    gmx_bool                bNoSolvOpt,
                    real                    print_force);
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 35774bec6f..739b582c1e 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -148,7 +148,7 @@ static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 
 
 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
 {
     int max_grid_x_size;
 
@@ -231,7 +231,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int
                                                        int                                  evdwtype,
                                                        bool                                 bDoEne,
                                                        bool                                 bDoPrune,
-                                                       struct gmx_device_info_t gmx_unused *devInfo)
+                                                       const gmx_device_info_t gmx_unused  *devInfo)
 {
     nbnxn_cu_kfunc_ptr_t res;
 
@@ -271,7 +271,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int
 }
 
 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo, const cu_nbparam_t *nbp)
+static inline int calc_shmem_required(const int num_threads_z, const gmx_device_info_t gmx_unused *dinfo, const cu_nbparam_t *nbp)
 {
     int shmem;
 
@@ -725,7 +725,7 @@ const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_
 
 /*! Set up the cache configuration for the non-bonded kernels,
  */
-void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo)
 {
     cudaError_t stat;
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index e29fccb303..52e255de56 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -78,7 +78,7 @@ static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
 static unsigned int gpu_min_ci_balanced_factor = 44;
 
 /* Functions from nbnxn_cuda.cu */
-extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
+extern void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo);
 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
@@ -584,19 +584,15 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
 }
 
 void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
-                    const gmx_gpu_info_t      *gpu_info,
-                    const gmx_gpu_opt_t       *gpu_opt,
+                    const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
                     nonbonded_verlet_group_t  *nbv_grp,
-                    int                        my_gpu_index,
                     int                        /*rank*/,
                     gmx_bool                   bLocalAndNonlocal)
 {
     cudaError_t       stat;
     gmx_nbnxn_cuda_t *nb;
 
-    assert(gpu_info);
-
     if (p_nb == NULL)
     {
         return;
@@ -624,7 +620,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
     init_plist(nb->plist[eintLocal]);
 
     /* set device info, just point it to the right GPU among the detected ones */
-    nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(*gpu_info, gpu_opt, my_gpu_index)];
+    nb->dev_info = deviceInfo;
 
     /* local/non-local GPU streams */
     stat = cudaStreamCreate(&nb->stream[eintLocal]);
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 09eccc34c0..9bdb258d06 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -237,7 +237,7 @@ struct cu_timers
  */
 struct gmx_nbnxn_cuda_t
 {
-    struct gmx_device_info_t *dev_info;       /**< CUDA device information                              */
+    const gmx_device_info_t  *dev_info;       /**< CUDA device information                              */
     bool                      bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
     cu_atomdata_t            *atdat;          /**< atom data                                            */
     cu_nbparam_t             *nbparam;        /**< parameters required for the non-bonded calc.         */
diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
index 0e16d5cfd9..cddb0f05a4 100644
--- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -56,16 +56,13 @@ struct nbnxn_pairlist_t;
 struct nbnxn_atomdata_t;
 struct gmx_wallclock_gpu_t;
 struct gmx_gpu_info_t;
-struct gmx_gpu_opt_t;
 
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
-                    const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                    const struct gmx_gpu_opt_t gmx_unused  *gpu_opt,
+                    const gmx_device_info_t gmx_unused     *deviceInfo,
                     const interaction_const_t gmx_unused   *ic,
                     nonbonded_verlet_group_t gmx_unused    *nbv_grp,
-                    int gmx_unused                          my_gpu_index,
                     int gmx_unused                          rank,
                     /* true if both local and non-local are done on GPU */
                     gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
index 633e5d3252..863927f0a4 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
@@ -94,7 +94,7 @@ static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 
 /*! \brief Validates the input global work size parameter.
  */
-static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
+static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
 {
     cl_uint device_size_t_size_bits;
     cl_uint host_size_t_size_bits;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
index 23566d8db8..a7b95e49a9 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -687,11 +687,9 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 
 //! This function is documented in the header file
 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
-                    const gmx_gpu_info_t      *gpu_info,
-                    const gmx_gpu_opt_t       *gpu_opt,
+                    const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
                     nonbonded_verlet_group_t  *nbv_grp,
-                    int                        my_gpu_index,
                     int                        rank,
                     gmx_bool                   bLocalAndNonlocal)
 {
@@ -699,8 +697,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
     cl_int                      cl_error;
     cl_command_queue_properties queue_properties;
 
-    assert(gpu_info);
-    assert(gpu_opt);
     assert(ic);
 
     if (p_nb == NULL)
@@ -723,7 +719,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
     snew(nb->timings, 1);
 
     /* set device info, just point it to the right GPU among the detected ones */
-    nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
+    nb->dev_info = deviceInfo;
     snew(nb->dev_rundata, 1);
 
     /* init to NULL the debug buffer */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
index 0fa212584e..7d81906d2a 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -271,7 +271,7 @@ typedef struct cl_timers
  */
 struct gmx_nbnxn_ocl_t
 {
-    struct gmx_device_info_t         *dev_info;    /**< OpenCL device information                              */
+    const gmx_device_info_t          *dev_info;    /**< OpenCL device information                              */
     struct gmx_device_runtime_data_t *dev_rundata; /**< OpenCL runtime data (context, kernels)                 */
 
     /**< Pointers to non-bonded kernel functions
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index 7cd11fad44..62f5030711 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -59,8 +59,6 @@ struct t_forcetable;
 struct t_nblist;
 struct t_nblists;
 struct t_QMMMrec;
-struct gmx_hw_info_t;
-struct gmx_gpu_opt_t;
 
 #ifdef __cplusplus
 extern "C" {
@@ -174,8 +172,6 @@ struct t_forcerec {
     rvec                        posres_com;
     rvec                        posres_comB;
 
-    const struct gmx_hw_info_t *hwinfo;
-    const struct gmx_gpu_opt_t *gpu_opt;
     gmx_bool                    use_simd_kernels;
 
     /* Interaction for calculated in kernels. In many cases this is similar to
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 7cc5d5ad57..c798109795 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -1144,10 +1144,21 @@ int Mdrunner::mdrunner()
     check_resource_division_efficiency(hwinfo, hw_opt, hw_opt.gpu_opt.n_dev_use, Flags & MD_NTOMPSET,
                                        cr, mdlog);
 
+    gmx_device_info_t *shortRangedDeviceInfo = nullptr;
+    int                shortRangedDeviceId   = -1;
+    if (cr->duty & DUTY_PP)
+    {
+        if (willUsePhysicalGpu)
+        {
+            shortRangedDeviceId   = hw_opt.gpu_opt.dev_use[cr->nrank_pp_intranode];
+            shortRangedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, shortRangedDeviceId);
+        }
+    }
+
     if (DOMAINDECOMP(cr))
     {
         /* When we share GPUs over ranks, we need to know this for the DLB */
-        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+        dd_setup_dlb_resource_sharing(cr, shortRangedDeviceId);
     }
 
     /* getting number of PP/PME threads
@@ -1187,8 +1198,6 @@ int Mdrunner::mdrunner()
 
         /* Initiate forcerecord */
         fr                 = mk_forcerec();
-        fr->hwinfo         = hwinfo;
-        fr->gpu_opt        = &hw_opt.gpu_opt;
         fr->forceProviders = mdModules.initForceProviders();
         init_forcerec(fplog, mdlog, fr, fcd,
                       inputrec, mtop, cr, box,
@@ -1196,6 +1205,7 @@ int Mdrunner::mdrunner()
                       opt2fn("-tablep", nfile, fnm),
                       getFilenm("-tableb", nfile, fnm),
                       nbpu_opt,
+                      shortRangedDeviceInfo,
                       FALSE,
                       pforce);
 
@@ -1425,7 +1435,7 @@ int Mdrunner::mdrunner()
     }
 
     /* Free GPU memory and context */
-    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : nullptr);
+    free_gpu_resources(fr, cr, shortRangedDeviceInfo);
 
     if (doMembed)
     {
-- 
2.11.4.GIT