From 1f5c9496a0250a15d1f1b9640832dc5244305e7e Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Mon, 2 Oct 2017 18:19:05 +0200 Subject: [PATCH] Continue removing -nb gpu_cpu Now that hybrid mode is gone, both local and non-local Verlet-scheme groups use the same kernel_type and thus both nbat pointers were always the same. Thus, there's no reason to maintain two of them. This simplifies and slightly optimizes nbnxn_atomdata_set(). Also fixed some other docs, comments, and logic that were either already wrong, or are useless with hybrid mode gone. Change-Id: Id02a11a00553b1df151a1e15b934611e0e15b9f7 --- docs/user-guide/mdrun-performance.rst | 7 +- src/gromacs/domdec/domdec.cpp | 2 +- src/gromacs/mdlib/forcerec.cpp | 90 +++++++++------------- src/gromacs/mdlib/nb_verlet.h | 2 +- src/gromacs/mdlib/nbnxn_atomdata.cpp | 37 +++------ src/gromacs/mdlib/nbnxn_atomdata.h | 1 - .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 12 +-- src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h | 6 +- .../mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp | 2 +- src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h | 3 + .../mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp | 2 +- .../mdlib/nbnxn_kernels/nbnxn_kernel_prune.h | 2 + .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 12 +-- src/gromacs/mdlib/nbnxn_search.cpp | 19 +---- src/gromacs/mdlib/nbnxn_tuning.cpp | 2 +- src/gromacs/mdlib/sim_util.cpp | 53 +++++-------- 16 files changed, 99 insertions(+), 153 deletions(-) diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst index 58689f4045..f6f254b700 100644 --- a/docs/user-guide/mdrun-performance.rst +++ b/docs/user-guide/mdrun-performance.rst @@ -246,13 +246,10 @@ behavior. ``-nb`` Used to set where to execute the non-bonded interactions. - Can be set to "auto", "cpu", "gpu", "gpu_cpu." + Can be set to "auto", "cpu", "gpu." Defaults to "auto," which uses a compatible GPU if available. Setting "cpu" requires that no GPU is used. Setting "gpu" requires - that a compatible GPU be available and will be used. Setting - "gpu_cpu" lets the GPU compute the local and the CPU the non-local - non-bonded interactions. Is only faster under a narrow range of - conditions. + that a compatible GPU be available and will be used. Examples for mdrun on one node ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 83925fbad3..3b27800426 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -9580,7 +9580,7 @@ void dd_partition_system(FILE *fplog, as_rvec_array(state_local->x.data()), ncg_moved, bRedist ? comm->moved : nullptr, fr->nbv->grp[eintLocal].kernel_type, - fr->nbv->grp[eintLocal].nbat); + fr->nbv->nbat); nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]); break; diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 87f74d20fa..ffaea07569 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2151,7 +2151,6 @@ static void init_nb_verlet(FILE *fp, matrix box) { nonbonded_verlet_t *nbv; - int i; char *env; nbnxn_alloc_t *nb_alloc; @@ -2174,10 +2173,9 @@ static void init_nb_verlet(FILE *fp, nbv->min_ci_balanced = 0; nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1); - for (i = 0; i < nbv->ngrp; i++) + for (int i = 0; i < nbv->ngrp; i++) { nbv->grp[i].nbl_lists.nnbl = 0; - nbv->grp[i].nbat = nullptr; nbv->grp[i].kernel_type = nbnxnkNotSet; if (i == 0) /* local */ @@ -2206,67 +2204,55 @@ static void init_nb_verlet(FILE *fp, bFEP_NonBonded, gmx_omp_nthreads_get(emntPairsearch)); - for (i = 0; i < nbv->ngrp; i++) - { - gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU, - &nb_alloc, &nb_free); + gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU, + &nb_alloc, &nb_free); + for (int i = 0; i < nbv->ngrp; i++) + { nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists, nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type), /* 8x8x8 "non-simple" lists are ATM always combined */ !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type), nb_alloc, nb_free); + } - if (i == 0 || - nbv->grp[0].kernel_type != nbv->grp[i].kernel_type) + int enbnxninitcombrule; + if (fr->ic->vdwtype == evdwCUT && + (fr->ic->vdw_modifier == eintmodNONE || + fr->ic->vdw_modifier == eintmodPOTSHIFT) && + getenv("GMX_NO_LJ_COMB_RULE") == nullptr) + { + /* Plain LJ cut-off: we can optimize with combination rules */ + enbnxninitcombrule = enbnxninitcombruleDETECT; + } + else if (fr->ic->vdwtype == evdwPME) + { + /* LJ-PME: we need to use a combination rule for the grid */ + if (fr->ljpme_combination_rule == eljpmeGEOM) { - gmx_bool bSimpleList; - int enbnxninitcombrule; - - bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type); - - if (fr->ic->vdwtype == evdwCUT && - (fr->ic->vdw_modifier == eintmodNONE || - fr->ic->vdw_modifier == eintmodPOTSHIFT) && - getenv("GMX_NO_LJ_COMB_RULE") == nullptr) - { - /* Plain LJ cut-off: we can optimize with combination rules */ - enbnxninitcombrule = enbnxninitcombruleDETECT; - } - else if (fr->ic->vdwtype == evdwPME) - { - /* LJ-PME: we need to use a combination rule for the grid */ - if (fr->ljpme_combination_rule == eljpmeGEOM) - { - enbnxninitcombrule = enbnxninitcombruleGEOM; - } - else - { - enbnxninitcombrule = enbnxninitcombruleLB; - } - } - else - { - /* We use a full combination matrix: no rule required */ - enbnxninitcombrule = enbnxninitcombruleNONE; - } - - - snew(nbv->grp[i].nbat, 1); - nbnxn_atomdata_init(fp, - nbv->grp[i].nbat, - nbv->grp[i].kernel_type, - enbnxninitcombrule, - fr->ntype, fr->nbfp, - ir->opts.ngener, - bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1, - nb_alloc, nb_free); + enbnxninitcombrule = enbnxninitcombruleGEOM; } else { - nbv->grp[i].nbat = nbv->grp[0].nbat; + enbnxninitcombrule = enbnxninitcombruleLB; } } + else + { + /* We use a full combination matrix: no rule required */ + enbnxninitcombrule = enbnxninitcombruleNONE; + } + + snew(nbv->nbat, 1); + bool bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[0].kernel_type); + nbnxn_atomdata_init(fp, + nbv->nbat, + nbv->grp[0].kernel_type, + enbnxninitcombrule, + fr->ntype, fr->nbfp, + ir->opts.ngener, + bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1, + nb_alloc, nb_free); if (nbv->bUseGPU) { @@ -2276,7 +2262,7 @@ static void init_nb_verlet(FILE *fp, deviceInfo, fr->ic, nbv->listParams.get(), - nbv->grp, + nbv->nbat, cr->nodeid, (nbv->ngrp > 1)); diff --git a/src/gromacs/mdlib/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h index 9d744b7f59..e8ef3884df 100644 --- a/src/gromacs/mdlib/nb_verlet.h +++ b/src/gromacs/mdlib/nb_verlet.h @@ -182,7 +182,6 @@ enum { * \brief Non-bonded interaction group data structure. */ typedef struct nonbonded_verlet_group_t { nbnxn_pairlist_set_t nbl_lists; /**< pair list(s) */ - nbnxn_atomdata_t *nbat; /**< atom data */ int kernel_type; /**< non-bonded kernel - see enum above */ int ewald_excl; /**< Ewald exclusion - see enum above */ } nonbonded_verlet_group_t; @@ -194,6 +193,7 @@ typedef struct nonbonded_verlet_t { nbnxn_search_t nbs; /**< n vs n atom pair searching data */ int ngrp; /**< number of interaction groups */ nonbonded_verlet_group_t grp[2]; /**< local and non-local interaction group */ + nbnxn_atomdata_t *nbat; /**< atom data */ gmx_bool bUseGPU; /**< TRUE when non-bonded interactions are computed on a physical GPU */ EmulateGpuNonbonded emulateGpu; /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */ diff --git a/src/gromacs/mdlib/nbnxn_atomdata.cpp b/src/gromacs/mdlib/nbnxn_atomdata.cpp index 66f9348178..2a66fad79b 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.cpp +++ b/src/gromacs/mdlib/nbnxn_atomdata.cpp @@ -790,11 +790,10 @@ static void copy_lj_to_nbat_lj_comb(const real *ljparam_type, /* Sets the atom type in nbnxn_atomdata_t */ static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat, - int ngrid, const nbnxn_search_t nbs, const int *type) { - for (int g = 0; g < ngrid; g++) + for (int g = 0; g < nbs->ngrid; g++) { const nbnxn_grid_t * grid = &nbs->grid[g]; @@ -812,12 +811,11 @@ static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat, /* Sets the LJ combination rule parameters in nbnxn_atomdata_t */ static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t *nbat, - int ngrid, const nbnxn_search_t nbs) { if (nbat->comb_rule != ljcrNONE) { - for (int g = 0; g < ngrid; g++) + for (int g = 0; g < nbs->ngrid; g++) { const nbnxn_grid_t * grid = &nbs->grid[g]; @@ -855,14 +853,13 @@ static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t *nbat, /* Sets the charges in nbnxn_atomdata_t *nbat */ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, - int ngrid, const nbnxn_search_t nbs, const real *charge) { int i; real *q; - for (int g = 0; g < ngrid; g++) + for (int g = 0; g < nbs->ngrid; g++) { const nbnxn_grid_t * grid = &nbs->grid[g]; @@ -914,7 +911,6 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, * using the original charge and LJ data, not nbnxn_atomdata_t. */ static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t *nbat, - int ngrid, const nbnxn_search_t nbs) { real *q; @@ -931,7 +927,7 @@ static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t *nbat, stride_q = 1; } - for (int g = 0; g < ngrid; g++) + for (int g = 0; g < nbs->ngrid; g++) { const nbnxn_grid_t * grid = &nbs->grid[g]; if (grid->bSimple) @@ -999,7 +995,6 @@ static void copy_egp_to_nbat_egps(const int *a, int na, int na_round, /* Set the energy group indices for atoms in nbnxn_atomdata_t */ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, - int ngrid, const nbnxn_search_t nbs, const int *atinfo) { @@ -1008,7 +1003,7 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, return; } - for (int g = 0; g < ngrid; g++) + for (int g = 0; g < nbs->ngrid; g++) { const nbnxn_grid_t * grid = &nbs->grid[g]; @@ -1027,35 +1022,23 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, /* Sets all required atom parameter data in nbnxn_atomdata_t */ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, - int locality, const nbnxn_search_t nbs, const t_mdatoms *mdatoms, const int *atinfo) { - int ngrid; + nbnxn_atomdata_set_atomtypes(nbat, nbs, mdatoms->typeA); - if (locality == eatLocal) - { - ngrid = 1; - } - else - { - ngrid = nbs->ngrid; - } - - nbnxn_atomdata_set_atomtypes(nbat, ngrid, nbs, mdatoms->typeA); - - nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA); + nbnxn_atomdata_set_charges(nbat, nbs, mdatoms->chargeA); if (nbs->bFEP) { - nbnxn_atomdata_mask_fep(nbat, ngrid, nbs); + nbnxn_atomdata_mask_fep(nbat, nbs); } /* This must be done after masking types for FEP */ - nbnxn_atomdata_set_ljcombparams(nbat, ngrid, nbs); + nbnxn_atomdata_set_ljcombparams(nbat, nbs); - nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo); + nbnxn_atomdata_set_energygroups(nbat, nbs, atinfo); } /* Copies the shift vector array to nbnxn_atomdata_t */ diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h index 0a47b7c1c1..0d1bedfd51 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.h +++ b/src/gromacs/mdlib/nbnxn_atomdata.h @@ -91,7 +91,6 @@ void nbnxn_atomdata_init(FILE *fp, /* Copy the atom data to the non-bonded atom data structure */ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, - int locality, const nbnxn_search_t nbs, const t_mdatoms *mdatoms, const int *atinfo); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index b0ef37ffea..9500e4b78f 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -424,10 +424,10 @@ static void init_timings(gmx_wallclock_gpu_t *t) static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nonbonded_verlet_group_t *nbv_group) + const nbnxn_atomdata_t *nbat) { - init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype); - init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_info); + init_atomdata_first(nb->atdat, nbat->ntype); + init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_info); /* clear energy and shift force outputs */ nbnxn_cuda_clear_e_fshift(nb); @@ -437,7 +437,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, const gmx_device_info_t *deviceInfo, const interaction_const_t *ic, const NbnxnListParameters *listParams, - nonbonded_verlet_group_t *nbv_grp, + const nbnxn_atomdata_t *nbat, int /*rank*/, gmx_bool bLocalAndNonlocal) { @@ -518,7 +518,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, /* pick L1 cache configuration */ nbnxn_cuda_set_cacheconfig(nb->dev_info); - nbnxn_cuda_init_const(nb, ic, listParams, nbv_grp); + nbnxn_cuda_init_const(nb, ic, listParams, nbat); *p_nb = nb; @@ -645,7 +645,7 @@ void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) } void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, - const struct nbnxn_atomdata_t *nbat) + const nbnxn_atomdata_t *nbat) { cudaError_t stat; int nalloc, natoms; diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h index ebc1d5171b..841574e399 100644 --- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h +++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h @@ -64,7 +64,7 @@ void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb, const gmx_device_info_t gmx_unused *deviceInfo, const interaction_const_t gmx_unused *ic, const NbnxnListParameters gmx_unused *listParams, - nonbonded_verlet_group_t gmx_unused *nbv_grp, + const nbnxn_atomdata_t gmx_unused *nbat, int gmx_unused rank, /* true if both local and non-local are done on GPU */ gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM @@ -78,7 +78,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb, /** Initializes atom-data on the GPU, called at every pair search step. */ GPU_FUNC_QUALIFIER void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM + const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. @@ -91,7 +91,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unus /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */ GPU_FUNC_QUALIFIER void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM + const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM /** Clears GPU outputs: nonbonded force, shift force and energy. */ GPU_FUNC_QUALIFIER diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp index cd9fbce337..bf9ba49f1f 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp @@ -134,6 +134,7 @@ reduceGroupEnergySimdBuffers(int numGroups, void nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, + const nbnxn_atomdata_t *nbat, const interaction_const_t *ic, rvec *shiftVectors, int forceFlags, @@ -142,7 +143,6 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, real *vCoulomb, real *vVdw) { - const nbnxn_atomdata_t *nbat = nbvg->nbat; int coulkt; if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h index 43bcbb7299..5f4ba0cde0 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.h @@ -48,6 +48,7 @@ #include "gromacs/utility/real.h" struct interaction_const_t; +struct nbnxn_atomdata_t; struct nonbonded_verlet_group_t; /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels. @@ -57,6 +58,7 @@ struct nonbonded_verlet_group_t; * within this function. * * \param[in,out] nbvg The group (local/non-local) to compute interaction for + * \param[in] nbat The atomdata for the interactions * \param[in] ic Non-bonded interaction constants * \param[in] shiftVectors The PBC shift vectors * \param[in] forceFlags Flags that tell what to compute @@ -67,6 +69,7 @@ struct nonbonded_verlet_group_t; */ void nbnxn_kernel_cpu(nonbonded_verlet_group_t *nbvg, + const nbnxn_atomdata_t *nbat, const interaction_const_t *ic, rvec *shiftVectors, int forceFlags, diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp index aefeafc239..7782e918d6 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.cpp @@ -48,11 +48,11 @@ void nbnxn_kernel_cpu_prune(nonbonded_verlet_group_t *nbvg, + const nbnxn_atomdata_t *nbat, const rvec *shift_vec, real rlistInner) { nbnxn_pairlist_set_t *nbl_lists = &nbvg->nbl_lists; - const nbnxn_atomdata_t *nbat = nbvg->nbat; GMX_ASSERT(nbl_lists->nbl[0]->nciOuter >= 0, "nciOuter<0, which signals an invalid pair-list"); diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h index ac6161e8d7..dfa436a34a 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_prune.h @@ -47,6 +47,7 @@ #include "gromacs/math/vectypes.h" #include "gromacs/utility/real.h" +struct nbnxn_atomdata_t; struct nonbonded_verlet_group_t; /*! \brief Prune all pair-lists in the set with distance \p rlistInner @@ -56,5 +57,6 @@ struct nonbonded_verlet_group_t; * to be consumed by the non-bonded kernel. */ void nbnxn_kernel_cpu_prune(nonbonded_verlet_group_t *nbvg, + const nbnxn_atomdata_t *nbat, const rvec *shift_vec, real rlistInner); diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index 6dd8e4f669..da63c733c4 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -708,10 +708,10 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb) static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nonbonded_verlet_group_t *nbv_group) + const nbnxn_atomdata_t *nbat) { - init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata); - init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata); + init_atomdata_first(nb->atdat, nbat->ntype, nb->dev_rundata); + init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_rundata); } @@ -720,7 +720,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, const gmx_device_info_t *deviceInfo, const interaction_const_t *ic, const NbnxnListParameters *listParams, - nonbonded_verlet_group_t *nbv_grp, + const nbnxn_atomdata_t *nbat, int rank, gmx_bool bLocalAndNonlocal) { @@ -812,7 +812,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, init_timings(nb->timings); } - nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp); + nbnxn_ocl_init_const(nb, ic, listParams, nbat); /* Enable LJ param manual prefetch for AMD or if we request through env. var. * TODO: decide about NVIDIA @@ -983,7 +983,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, //! This function is documented in the header file void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, - const struct nbnxn_atomdata_t *nbat) + const nbnxn_atomdata_t *nbat) { cl_int cl_error; int nalloc, natoms; diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp index f33ecae48c..736e7c0f1d 100644 --- a/src/gromacs/mdlib/nbnxn_search.cpp +++ b/src/gromacs/mdlib/nbnxn_search.cpp @@ -4100,7 +4100,6 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, t_nrnb *nrnb) { nbnxn_grid_t *gridi, *gridj; - gmx_bool bGPUCPU; int nzi, zj0, zj1; int nsubpair_target; float nsubpair_tot_est; @@ -4111,9 +4110,6 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, gmx_bool progBal; int np_tot, np_noq, np_hlj, nap; - /* Check if we are running hybrid GPU + CPU nbnxn mode */ - bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple); - nnbl = nbl_list->nnbl; nbl = nbl_list->nbl; CombineNBLists = nbl_list->bCombined; @@ -4125,7 +4121,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, nbat->bUseBufferFlags = (nbat->nout > 1); /* We should re-init the flags before making the first list */ - if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU)) + if (nbat->bUseBufferFlags && LOCAL_I(iloc)) { init_buffer_flags(&nbat->buffer_flags, nbat->natoms); } @@ -4217,15 +4213,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, nbs_cycle_start(&nbs->cc[enbsCCsearch]); - if (nbl[0]->bSimple && !gridi->bSimple) - { - /* Hybrid list, determine blocking later */ - ci_block = 0; - } - else - { - ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl); - } + ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl); /* With GPU: generate progressively smaller lists for * load balancing for local only or non-local with 2 zones. @@ -4240,8 +4228,7 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, /* Re-init the thread-local work flag data before making * the first list (not an elegant conditional). */ - if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) || - (bGPUCPU && zi == 0 && zj == 1))) + if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0))) { init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms); } diff --git a/src/gromacs/mdlib/nbnxn_tuning.cpp b/src/gromacs/mdlib/nbnxn_tuning.cpp index 4a4af2dcc6..d88946cd54 100644 --- a/src/gromacs/mdlib/nbnxn_tuning.cpp +++ b/src/gromacs/mdlib/nbnxn_tuning.cpp @@ -115,7 +115,7 @@ static const float c_nbnxnListSizeFactorCpu = 1.25; //! Target pair-list size increase ratio for Intel KNL static const float c_nbnxnListSizeFactorIntelXeonPhi = 1.4; // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel. -//! Target pair-list size increase ratio for hybrid CPU-GPU +//! Target pair-list size increase ratio for GPU static const float c_nbnxnListSizeFactorGPU = 1.4; //! Never increase the size of the pair-list more than the factor above plus this margin static const float c_nbnxnListSizeFactorMargin = 0.1; diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index f31a54d975..815b1d964d 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -443,7 +443,7 @@ static void do_nb_verlet(t_forcerec *fr, * the current coordinates of the atoms. */ wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING); - nbnxn_kernel_cpu_prune(nbvg, fr->shift_vec, nbv->listParams->rlistInner); + nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner); wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING); } @@ -456,6 +456,7 @@ static void do_nb_verlet(t_forcerec *fr, case nbnxnk4xN_SIMD_4xN: case nbnxnk4xN_SIMD_2xNN: nbnxn_kernel_cpu(nbvg, + nbv->nbat, ic, fr->shift_vec, flags, @@ -468,16 +469,16 @@ static void do_nb_verlet(t_forcerec *fr, break; case nbnxnk8x8x8_GPU: - nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality); + nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbv->nbat, flags, ilocality); break; case nbnxnk8x8x8_PlainC: nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0], - nbvg->nbat, ic, + nbv->nbat, ic, fr->shift_vec, flags, clearF, - nbvg->nbat->out[0].f, + nbv->nbat->out[0].f, fr->fshift[0], enerd->grpp.ener[egCOULSR], fr->bBHAM ? @@ -903,7 +904,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, } nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX, - fr->shift_vec, nbv->grp[0].nbat); + fr->shift_vec, nbv->nbat); #if GMX_MPI if (!(cr->duty & DUTY_PME)) @@ -958,7 +959,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, 0, mdatoms->homenr, -1, fr->cginfo, x, 0, nullptr, nbv->grp[eintLocal].kernel_type, - nbv->grp[eintLocal].nbat); + nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL); } else @@ -967,23 +968,11 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd), fr->cginfo, x, nbv->grp[eintNonlocal].kernel_type, - nbv->grp[eintNonlocal].nbat); + nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL); } - if (nbv->ngrp == 1 || - nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat) - { - nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll, - nbv->nbs, mdatoms, fr->cginfo); - } - else - { - nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal, - nbv->nbs, mdatoms, fr->cginfo); - nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll, - nbv->nbs, mdatoms, fr->cginfo); - } + nbnxn_atomdata_set(nbv->nbat, nbv->nbs, mdatoms, fr->cginfo); wallcycle_stop(wcycle, ewcNS); } @@ -995,10 +984,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, if (bNS) { - nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat); + nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat); } - nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat); + nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); wallcycle_stop(wcycle, ewcLAUNCH_GPU); @@ -1009,7 +998,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, { wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL); - nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat, + nbnxn_make_pairlist(nbv->nbs, nbv->nbat, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, @@ -1038,7 +1027,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x, - nbv->grp[eintLocal].nbat); + nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); } @@ -1068,7 +1057,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL); - nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat, + nbnxn_make_pairlist(nbv->nbs, nbv->nbat, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, @@ -1101,7 +1090,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x, - nbv->grp[eintNonlocal].nbat); + nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); } @@ -1125,10 +1114,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (DOMAINDECOMP(cr)) { - nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat, + nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, flags, eatNonlocal); } - nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat, + nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, flags, eatLocal); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); wallcycle_stop(wcycle, ewcLAUNCH_GPU); @@ -1277,7 +1266,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_stop(wcycle, ewcFORCE); wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f); + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->nbat, f); wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); wallcycle_start_nocount(wcycle, ewcFORCE); @@ -1288,7 +1277,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ - nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat, + nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->nbat, fr->fshift); } } @@ -1341,7 +1330,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0) { nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal, - nbv->grp[eintNonlocal].nbat, f); + nbv->nbat, f); } wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); @@ -1443,7 +1432,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal, - nbv->grp[eintLocal].nbat, f); + nbv->nbat, f); wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); } -- 2.11.4.GIT