From 1f5c9496a0250a15d1f1b9640832dc5244305e7e Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Mon, 2 Oct 2017 18:19:05 +0200
Subject: [PATCH] Continue removing -nb gpu_cpu

Now that hybrid mode is gone, both local and non-local Verlet-scheme
groups use the same kernel_type, so both nbat pointers are always
the same and there is no reason to maintain two of them.
This simplifies and slightly optimizes nbnxn_atomdata_set().

Also fixed some other docs, comments, and logic that were either
already wrong, or are useless with hybrid mode gone.

Change-Id: Id02a11a00553b1df151a1e15b934611e0e15b9f7
---
 docs/user-guide/mdrun-performance.rst              |  7 +-
 src/gromacs/domdec/domdec.cpp                      |  2 +-
 src/gromacs/mdlib/forcerec.cpp                     | 90 +++++++++-------------
 src/gromacs/mdlib/nb_verlet.h                      |  2 +-
 src/gromacs/mdlib/nbnxn_atomdata.cpp               | 37 +++------
 src/gromacs/mdlib/nbnxn_atomdata.h                 |  1 -
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu       | 12 +--
 src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h            |  6 +-
 .../mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp       |  2 +-
 src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h |  3 +
 .../mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp     |  2 +-
 .../mdlib/nbnxn_kernels/nbnxn_kernel_prune.h       |  2 +
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp        | 12 +--
 src/gromacs/mdlib/nbnxn_search.cpp                 | 19 +----
 src/gromacs/mdlib/nbnxn_tuning.cpp                 |  2 +-
 src/gromacs/mdlib/sim_util.cpp                     | 53 +++++--------
 16 files changed, 99 insertions(+), 153 deletions(-)

diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst
index 58689f4045..f6f254b700 100644
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -246,13 +246,10 @@ behavior.
 
 ``-nb``
     Used to set where to execute the non-bonded interactions.
-    Can be set to "auto", "cpu", "gpu", "gpu_cpu."
+    Can be set to "auto", "cpu", "gpu."
     Defaults to "auto," which uses a compatible GPU if available.
     Setting "cpu" requires that no GPU is used. Setting "gpu" requires
-    that a compatible GPU be available and will be used. Setting
-    "gpu_cpu" lets the GPU compute the local and the CPU the non-local
-    non-bonded interactions. Is only faster under a narrow range of
-    conditions.
+    that a compatible GPU be available and will be used.
 
 Examples for mdrun on one node
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 83925fbad3..3b27800426 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -9580,7 +9580,7 @@ void dd_partition_system(FILE                *fplog,
                                   as_rvec_array(state_local->x.data()),
                                   ncg_moved, bRedist ? comm->moved : nullptr,
                                   fr->nbv->grp[eintLocal].kernel_type,
-                                  fr->nbv->grp[eintLocal].nbat);
+                                  fr->nbv->nbat);
 
                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                 break;
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 87f74d20fa..ffaea07569 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2151,7 +2151,6 @@ static void init_nb_verlet(FILE                *fp,
                            matrix               box)
 {
     nonbonded_verlet_t *nbv;
-    int                 i;
     char               *env;
 
     nbnxn_alloc_t      *nb_alloc;
@@ -2174,10 +2173,9 @@ static void init_nb_verlet(FILE                *fp,
     nbv->min_ci_balanced = 0;
 
     nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
-    for (i = 0; i < nbv->ngrp; i++)
+    for (int i = 0; i < nbv->ngrp; i++)
     {
         nbv->grp[i].nbl_lists.nnbl = 0;
-        nbv->grp[i].nbat           = nullptr;
         nbv->grp[i].kernel_type    = nbnxnkNotSet;
 
         if (i == 0) /* local */
@@ -2206,67 +2204,55 @@ static void init_nb_verlet(FILE                *fp,
                       bFEP_NonBonded,
                       gmx_omp_nthreads_get(emntPairsearch));
 
-    for (i = 0; i < nbv->ngrp; i++)
-    {
-        gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
-                                     &nb_alloc, &nb_free);
+    gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
+                                 &nb_alloc, &nb_free);
 
+    for (int i = 0; i < nbv->ngrp; i++)
+    {
         nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
                                 nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
                                 /* 8x8x8 "non-simple" lists are ATM always combined */
                                 !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
                                 nb_alloc, nb_free);
+    }
 
-        if (i == 0 ||
-            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
+    int      enbnxninitcombrule;
+    if (fr->ic->vdwtype == evdwCUT &&
+        (fr->ic->vdw_modifier == eintmodNONE ||
+         fr->ic->vdw_modifier == eintmodPOTSHIFT) &&
+        getenv("GMX_NO_LJ_COMB_RULE") == nullptr)
+    {
+        /* Plain LJ cut-off: we can optimize with combination rules */
+        enbnxninitcombrule = enbnxninitcombruleDETECT;
+    }
+    else if (fr->ic->vdwtype == evdwPME)
+    {
+        /* LJ-PME: we need to use a combination rule for the grid */
+        if (fr->ljpme_combination_rule == eljpmeGEOM)
         {
-            gmx_bool bSimpleList;
-            int      enbnxninitcombrule;
-
-            bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
-
-            if (fr->ic->vdwtype == evdwCUT &&
-                (fr->ic->vdw_modifier == eintmodNONE ||
-                 fr->ic->vdw_modifier == eintmodPOTSHIFT) &&
-                getenv("GMX_NO_LJ_COMB_RULE") == nullptr)
-            {
-                /* Plain LJ cut-off: we can optimize with combination rules */
-                enbnxninitcombrule = enbnxninitcombruleDETECT;
-            }
-            else if (fr->ic->vdwtype == evdwPME)
-            {
-                /* LJ-PME: we need to use a combination rule for the grid */
-                if (fr->ljpme_combination_rule == eljpmeGEOM)
-                {
-                    enbnxninitcombrule = enbnxninitcombruleGEOM;
-                }
-                else
-                {
-                    enbnxninitcombrule = enbnxninitcombruleLB;
-                }
-            }
-            else
-            {
-                /* We use a full combination matrix: no rule required */
-                enbnxninitcombrule = enbnxninitcombruleNONE;
-            }
-
-
-            snew(nbv->grp[i].nbat, 1);
-            nbnxn_atomdata_init(fp,
-                                nbv->grp[i].nbat,
-                                nbv->grp[i].kernel_type,
-                                enbnxninitcombrule,
-                                fr->ntype, fr->nbfp,
-                                ir->opts.ngener,
-                                bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
-                                nb_alloc, nb_free);
+            enbnxninitcombrule = enbnxninitcombruleGEOM;
         }
         else
         {
-            nbv->grp[i].nbat = nbv->grp[0].nbat;
+            enbnxninitcombrule = enbnxninitcombruleLB;
         }
     }
+    else
+    {
+        /* We use a full combination matrix: no rule required */
+        enbnxninitcombrule = enbnxninitcombruleNONE;
+    }
+
+    snew(nbv->nbat, 1);
+    bool bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[0].kernel_type);
+    nbnxn_atomdata_init(fp,
+                        nbv->nbat,
+                        nbv->grp[0].kernel_type,
+                        enbnxninitcombrule,
+                        fr->ntype, fr->nbfp,
+                        ir->opts.ngener,
+                        bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+                        nb_alloc, nb_free);
 
     if (nbv->bUseGPU)
     {
@@ -2276,7 +2262,7 @@ static void init_nb_verlet(FILE                *fp,
                        deviceInfo,
                        fr->ic,
                        nbv->listParams.get(),
-                       nbv->grp,
+                       nbv->nbat,
                        cr->nodeid,
                        (nbv->ngrp > 1));
 
diff --git a/src/gromacs/mdlib/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h
index 9d744b7f59..e8ef3884df 100644
--- a/src/gromacs/mdlib/nb_verlet.h
+++ b/src/gromacs/mdlib/nb_verlet.h
@@ -182,7 +182,6 @@ enum {
  *  \brief Non-bonded interaction group data structure. */
 typedef struct nonbonded_verlet_group_t {
     nbnxn_pairlist_set_t  nbl_lists;   /**< pair list(s)                       */
-    nbnxn_atomdata_t     *nbat;        /**< atom data                          */
     int                   kernel_type; /**< non-bonded kernel - see enum above */
     int                   ewald_excl;  /**< Ewald exclusion - see enum above   */
 } nonbonded_verlet_group_t;
@@ -194,6 +193,7 @@ typedef struct nonbonded_verlet_t {
     nbnxn_search_t                       nbs;             /**< n vs n atom pair searching data       */
     int                                  ngrp;            /**< number of interaction groups          */
     nonbonded_verlet_group_t             grp[2];          /**< local and non-local interaction group */
+    nbnxn_atomdata_t                    *nbat;            /**< atom data                             */
 
     gmx_bool                             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
     EmulateGpuNonbonded                  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.cpp b/src/gromacs/mdlib/nbnxn_atomdata.cpp
index 66f9348178..2a66fad79b 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.cpp
+++ b/src/gromacs/mdlib/nbnxn_atomdata.cpp
@@ -790,11 +790,10 @@ static void copy_lj_to_nbat_lj_comb(const real *ljparam_type,
 
 /* Sets the atom type in nbnxn_atomdata_t */
 static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
-                                         int                  ngrid,
                                          const nbnxn_search_t nbs,
                                          const int           *type)
 {
-    for (int g = 0; g < ngrid; g++)
+    for (int g = 0; g < nbs->ngrid; g++)
     {
         const nbnxn_grid_t * grid = &nbs->grid[g];
 
@@ -812,12 +811,11 @@ static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t    *nbat,
 
 /* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
 static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
-                                            int                  ngrid,
                                             const nbnxn_search_t nbs)
 {
     if (nbat->comb_rule != ljcrNONE)
     {
-        for (int g = 0; g < ngrid; g++)
+        for (int g = 0; g < nbs->ngrid; g++)
         {
             const nbnxn_grid_t * grid = &nbs->grid[g];
 
@@ -855,14 +853,13 @@ static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
 
 /* Sets the charges in nbnxn_atomdata_t *nbat */
 static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
-                                       int                  ngrid,
                                        const nbnxn_search_t nbs,
                                        const real          *charge)
 {
     int                 i;
     real               *q;
 
-    for (int g = 0; g < ngrid; g++)
+    for (int g = 0; g < nbs->ngrid; g++)
     {
         const nbnxn_grid_t * grid = &nbs->grid[g];
 
@@ -914,7 +911,6 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t    *nbat,
  * using the original charge and LJ data, not nbnxn_atomdata_t.
  */
 static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
-                                    int                  ngrid,
                                     const nbnxn_search_t nbs)
 {
     real               *q;
@@ -931,7 +927,7 @@ static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t    *nbat,
         stride_q = 1;
     }
 
-    for (int g = 0; g < ngrid; g++)
+    for (int g = 0; g < nbs->ngrid; g++)
     {
         const nbnxn_grid_t * grid = &nbs->grid[g];
         if (grid->bSimple)
@@ -999,7 +995,6 @@ static void copy_egp_to_nbat_egps(const int *a, int na, int na_round,
 
 /* Set the energy group indices for atoms in nbnxn_atomdata_t */
 static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
-                                            int                  ngrid,
                                             const nbnxn_search_t nbs,
                                             const int           *atinfo)
 {
@@ -1008,7 +1003,7 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
         return;
     }
 
-    for (int g = 0; g < ngrid; g++)
+    for (int g = 0; g < nbs->ngrid; g++)
     {
         const nbnxn_grid_t * grid = &nbs->grid[g];
 
@@ -1027,35 +1022,23 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t    *nbat,
 
 /* Sets all required atom parameter data in nbnxn_atomdata_t */
 void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
-                        int                  locality,
                         const nbnxn_search_t nbs,
                         const t_mdatoms     *mdatoms,
                         const int           *atinfo)
 {
-    int ngrid;
+    nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA);
 
-    if (locality == eatLocal)
-    {
-        ngrid = 1;
-    }
-    else
-    {
-        ngrid = nbs->ngrid;
-    }
-
-    nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA);
-
-    nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA);
+    nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA);
 
     if (nbs->bFEP)
     {
-        nbnxn_atomdata_mask_fep(nbat, ngrid, nbs);
+        nbnxn_atomdata_mask_fep(nbat, nbs);
     }
 
     /* This must be done after masking types for FEP */
-    nbnxn_atomdata_set_ljcombparams(nbat, ngrid, nbs);
+    nbnxn_atomdata_set_ljcombparams(nbat, nbs);
 
-    nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo);
+    nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo);
 }
 
 /* Copies the shift vector array to nbnxn_atomdata_t */
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h
index 0a47b7c1c1..0d1bedfd51 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.h
+++ b/src/gromacs/mdlib/nbnxn_atomdata.h
@@ -91,7 +91,6 @@ void nbnxn_atomdata_init(FILE *fp,
 
 /* Copy the atom data to the non-bonded atom data structure */
 void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
-                        int                  locality,
                         const nbnxn_search_t nbs,
                         const t_mdatoms     *mdatoms,
                         const int           *atinfo);
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index b0ef37ffea..9500e4b78f 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -424,10 +424,10 @@ static void init_timings(gmx_wallclock_gpu_t *t)
 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
                                   const interaction_const_t      *ic,
                                   const NbnxnListParameters      *listParams,
-                                  const nonbonded_verlet_group_t *nbv_group)
+                                  const nbnxn_atomdata_t         *nbat)
 {
-    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_info);
+    init_atomdata_first(nb->atdat, nbat->ntype);
+    init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_info);
 
     /* clear energy and shift force outputs */
     nbnxn_cuda_clear_e_fshift(nb);
@@ -437,7 +437,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
                     const NbnxnListParameters *listParams,
-                    nonbonded_verlet_group_t  *nbv_grp,
+                    const nbnxn_atomdata_t    *nbat,
                     int                        /*rank*/,
                     gmx_bool                   bLocalAndNonlocal)
 {
@@ -518,7 +518,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
     /* pick L1 cache configuration */
     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 
-    nbnxn_cuda_init_const(nb, ic, listParams, nbv_grp);
+    nbnxn_cuda_init_const(nb, ic, listParams, nbat);
 
     *p_nb = nb;
 
@@ -645,7 +645,7 @@ void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
 }
 
 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
-                             const struct nbnxn_atomdata_t *nbat)
+                             const nbnxn_atomdata_t        *nbat)
 {
     cudaError_t    stat;
     int            nalloc, natoms;
diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
index ebc1d5171b..841574e399 100644
--- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
@@ -64,7 +64,7 @@ void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
                     const gmx_device_info_t gmx_unused     *deviceInfo,
                     const interaction_const_t gmx_unused   *ic,
                     const NbnxnListParameters gmx_unused   *listParams,
-                    nonbonded_verlet_group_t gmx_unused    *nbv_grp,
+                    const nbnxn_atomdata_t gmx_unused      *nbat,
                     int gmx_unused                          rank,
                     /* true if both local and non-local are done on GPU */
                     gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
@@ -78,7 +78,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
 /** Initializes atom-data on the GPU, called at every pair search step. */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                             const struct nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM
+                             const nbnxn_atomdata_t gmx_unused        *nbat) GPU_FUNC_TERM
 
 /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed.
@@ -91,7 +91,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unus
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused               *nb,
-                               const struct nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM
+                               const nbnxn_atomdata_t gmx_unused        *nbatom) GPU_FUNC_TERM
 
 /** Clears GPU outputs: nonbonded force, shift force and energy. */
 GPU_FUNC_QUALIFIER
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
index cd9fbce337..bf9ba49f1f 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
@@ -134,6 +134,7 @@ reduceGroupEnergySimdBuffers(int                       numGroups,
 
 void
 nbnxn_kernel_cpu(nonbonded_verlet_group_t  *nbvg,
+                 const nbnxn_atomdata_t    *nbat,
                  const interaction_const_t *ic,
                  rvec                      *shiftVectors,
                  int                        forceFlags,
@@ -142,7 +143,6 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t  *nbvg,
                  real                      *vCoulomb,
                  real                      *vVdw)
 {
-    const nbnxn_atomdata_t  *nbat = nbvg->nbat;
 
     int                      coulkt;
     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h
index 43bcbb7299..5f4ba0cde0 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h
@@ -48,6 +48,7 @@
 #include "gromacs/utility/real.h"
 
 struct interaction_const_t;
+struct nbnxn_atomdata_t;
 struct nonbonded_verlet_group_t;
 
 /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
@@ -57,6 +58,7 @@ struct nonbonded_verlet_group_t;
  * within this function.
  *
  * \param[in,out] nbvg          The group (local/non-local) to compute interaction for
+ * \param[in]     nbat          The atomdata for the interactions
  * \param[in]     ic            Non-bonded interaction constants
  * \param[in]     shiftVectors  The PBC shift vectors
  * \param[in]     forceFlags    Flags that tell what to compute
@@ -67,6 +69,7 @@ struct nonbonded_verlet_group_t;
  */
 void
 nbnxn_kernel_cpu(nonbonded_verlet_group_t  *nbvg,
+                 const nbnxn_atomdata_t    *nbat,
                  const interaction_const_t *ic,
                  rvec                      *shiftVectors,
                  int                        forceFlags,
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp
index aefeafc239..7782e918d6 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp
@@ -48,11 +48,11 @@
 
 
 void nbnxn_kernel_cpu_prune(nonbonded_verlet_group_t *nbvg,
+                            const nbnxn_atomdata_t   *nbat,
                             const rvec               *shift_vec,
                             real                      rlistInner)
 {
     nbnxn_pairlist_set_t   *nbl_lists = &nbvg->nbl_lists;
-    const nbnxn_atomdata_t *nbat      = nbvg->nbat;
 
     GMX_ASSERT(nbl_lists->nbl[0]->nciOuter >= 0, "nciOuter<0, which signals an invalid pair-list");
 
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h
index ac6161e8d7..dfa436a34a 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h
@@ -47,6 +47,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/real.h"
 
+struct nbnxn_atomdata_t;
 struct nonbonded_verlet_group_t;
 
 /*! \brief Prune all pair-lists in the set with distance \p rlistInner
@@ -56,5 +57,6 @@ struct nonbonded_verlet_group_t;
  * to be consumed by the non-bonded kernel.
  */
 void nbnxn_kernel_cpu_prune(nonbonded_verlet_group_t  *nbvg,
+                            const nbnxn_atomdata_t    *nbat,
                             const rvec                *shift_vec,
                             real                       rlistInner);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
index 6dd8e4f669..da63c733c4 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -708,10 +708,10 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
                                  const interaction_const_t      *ic,
                                  const NbnxnListParameters      *listParams,
-                                 const nonbonded_verlet_group_t *nbv_group)
+                                 const nbnxn_atomdata_t         *nbat)
 {
-    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
-    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata);
+    init_atomdata_first(nb->atdat, nbat->ntype, nb->dev_rundata);
+    init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_rundata);
 }
 
 
@@ -720,7 +720,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
                     const NbnxnListParameters *listParams,
-                    nonbonded_verlet_group_t  *nbv_grp,
+                    const nbnxn_atomdata_t    *nbat,
                     int                        rank,
                     gmx_bool                   bLocalAndNonlocal)
 {
@@ -812,7 +812,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
         init_timings(nb->timings);
     }
 
-    nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp);
+    nbnxn_ocl_init_const(nb, ic, listParams, nbat);
 
     /* Enable LJ param manual prefetch for AMD or if we request through env. var.
      * TODO: decide about NVIDIA
@@ -983,7 +983,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
 
 //! This function is documented in the header file
 void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
-                             const struct nbnxn_atomdata_t *nbat)
+                             const nbnxn_atomdata_t        *nbat)
 {
     cl_int           cl_error;
     int              nalloc, natoms;
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp
index f33ecae48c..736e7c0f1d 100644
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -4100,7 +4100,6 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
                          t_nrnb               *nrnb)
 {
     nbnxn_grid_t      *gridi, *gridj;
-    gmx_bool           bGPUCPU;
     int                nzi, zj0, zj1;
     int                nsubpair_target;
     float              nsubpair_tot_est;
@@ -4111,9 +4110,6 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
     gmx_bool           progBal;
     int                np_tot, np_noq, np_hlj, nap;
 
-    /* Check if we are running hybrid GPU + CPU nbnxn mode */
-    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
-
     nnbl            = nbl_list->nnbl;
     nbl             = nbl_list->nbl;
     CombineNBLists  = nbl_list->bCombined;
@@ -4125,7 +4121,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
 
     nbat->bUseBufferFlags = (nbat->nout > 1);
     /* We should re-init the flags before making the first list */
-    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
+    if (nbat->bUseBufferFlags && LOCAL_I(iloc))
     {
         init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
     }
@@ -4217,15 +4213,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
 
             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 
-            if (nbl[0]->bSimple && !gridi->bSimple)
-            {
-                /* Hybrid list, determine blocking later */
-                ci_block = 0;
-            }
-            else
-            {
-                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
-            }
+            ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
 
             /* With GPU: generate progressively smaller lists for
              * load balancing for local only or non-local with 2 zones.
@@ -4240,8 +4228,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
                     /* Re-init the thread-local work flag data before making
                      * the first list (not an elegant conditional).
                      */
-                    if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
-                                                  (bGPUCPU && zi == 0 && zj == 1)))
+                    if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0)))
                     {
                         init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
                     }
diff --git a/src/gromacs/mdlib/nbnxn_tuning.cpp b/src/gromacs/mdlib/nbnxn_tuning.cpp
index 4a4af2dcc6..d88946cd54 100644
--- a/src/gromacs/mdlib/nbnxn_tuning.cpp
+++ b/src/gromacs/mdlib/nbnxn_tuning.cpp
@@ -115,7 +115,7 @@ static const float c_nbnxnListSizeFactorCpu           = 1.25;
 //! Target pair-list size increase ratio for Intel KNL
 static const float c_nbnxnListSizeFactorIntelXeonPhi  = 1.4;
 // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel.
-//! Target pair-list size increase ratio for hybrid CPU-GPU
+//! Target pair-list size increase ratio for GPU
 static const float c_nbnxnListSizeFactorGPU           = 1.4;
 //! Never increase the size of the pair-list more than the factor above plus this margin
 static const float c_nbnxnListSizeFactorMargin        = 0.1;
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f31a54d975..815b1d964d 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -443,7 +443,7 @@ static void do_nb_verlet(t_forcerec *fr,
              * the current coordinates of the atoms.
              */
             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
-            nbnxn_kernel_cpu_prune(nbvg, fr->shift_vec, nbv->listParams->rlistInner);
+            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
         }
 
@@ -456,6 +456,7 @@ static void do_nb_verlet(t_forcerec *fr,
         case nbnxnk4xN_SIMD_4xN:
         case nbnxnk4xN_SIMD_2xNN:
             nbnxn_kernel_cpu(nbvg,
+                             nbv->nbat,
                              ic,
                              fr->shift_vec,
                              flags,
@@ -468,16 +469,16 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality);
             break;
 
         case nbnxnk8x8x8_PlainC:
             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
-                                 nbvg->nbat, ic,
+                                 nbv->nbat, ic,
                                  fr->shift_vec,
                                  flags,
                                  clearF,
-                                 nbvg->nbat->out[0].f,
+                                 nbv->nbat->out[0].f,
                                  fr->fshift[0],
                                  enerd->grpp.ener[egCOULSR],
                                  fr->bBHAM ?
@@ -903,7 +904,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     }
 
     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
-                                 fr->shift_vec, nbv->grp[0].nbat);
+                                 fr->shift_vec, nbv->nbat);
 
 #if GMX_MPI
     if (!(cr->duty & DUTY_PME))
@@ -958,7 +959,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                               0, mdatoms->homenr, -1, fr->cginfo, x,
                               0, nullptr,
                               nbv->grp[eintLocal].kernel_type,
-                              nbv->grp[eintLocal].nbat);
+                              nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
         }
         else
@@ -967,23 +968,11 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                        fr->cginfo, x,
                                        nbv->grp[eintNonlocal].kernel_type,
-                                       nbv->grp[eintNonlocal].nbat);
+                                       nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
         }
 
-        if (nbv->ngrp == 1 ||
-            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
-        {
-            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
-                               nbv->nbs, mdatoms, fr->cginfo);
-        }
-        else
-        {
-            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
-                               nbv->nbs, mdatoms, fr->cginfo);
-            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
-                               nbv->nbs, mdatoms, fr->cginfo);
-        }
+        nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo);
         wallcycle_stop(wcycle, ewcNS);
     }
 
@@ -995,10 +984,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
         if (bNS)
         {
-            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
+            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
         }
 
-        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
+        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
 
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1009,7 +998,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     {
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
-        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+        nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                             &top->excls,
                             nbv->listParams->rlistOuter,
                             nbv->min_ci_balanced,
@@ -1038,7 +1027,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
-                                        nbv->grp[eintLocal].nbat);
+                                        nbv->nbat);
         wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
     }
@@ -1068,7 +1057,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start_nocount(wcycle, ewcNS);
             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
-            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+            nbnxn_make_pairlist(nbv->nbs, nbv->nbat,
                                 &top->excls,
                                 nbv->listParams->rlistOuter,
                                 nbv->min_ci_balanced,
@@ -1101,7 +1090,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
-                                            nbv->grp[eintNonlocal].nbat);
+                                            nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
             wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
         }
@@ -1125,10 +1114,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         if (DOMAINDECOMP(cr))
         {
-            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                      flags, eatNonlocal);
         }
-        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, eatLocal);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1277,7 +1266,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_stop(wcycle, ewcFORCE);
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f);
         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_start_nocount(wcycle, ewcFORCE);
@@ -1288,7 +1277,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             /* This is not in a subcounter because it takes a
                negligible and constant-sized amount of time */
-            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat,
                                                      fr->fshift);
         }
     }
@@ -1341,7 +1330,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
             {
                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
-                                               nbv->grp[eintNonlocal].nbat, f);
+                                               nbv->nbat, f);
             }
             wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
             wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
@@ -1443,7 +1432,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
-                                       nbv->grp[eintLocal].nbat, f);
+                                       nbv->nbat, f);
         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
     }
-- 
2.11.4.GIT