From 11302f48521725cef4346b8f482dbaaffd0be8fd Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 25 Jun 2020 15:31:36 +0000 Subject: [PATCH] Make cl_nbparam into a struct This is needed to unify with CUDA path --- src/gromacs/gpu_utils/gputraits.cuh | 3 + src/gromacs/gpu_utils/gputraits.h | 2 + src/gromacs/gpu_utils/gputraits_ocl.h | 2 + src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 8 +-- src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 16 ++--- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 2 +- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu | 6 +- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh | 6 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh | 78 +++++++++++----------- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 69 +------------------ src/gromacs/nbnxm/gpu_types_common.h | 66 ++++++++++++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 15 ++--- src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 34 +++++----- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 62 +---------------- 14 files changed, 157 insertions(+), 212 deletions(-) diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 76606611b8..98fd8d04ef 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -48,6 +48,9 @@ #include "gromacs/hardware/gpu_hw_info.h" +//! Device texture for fast read-only data fetching +using DeviceTexture = cudaTextureObject_t; + /*! \brief CUDA device information. * * The CUDA device information is queried and set at detection and contains diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index 5fec00303a..9ae87f1436 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -57,6 +57,8 @@ #else +using DeviceTexture = void*; + //! Stub for device information. struct DeviceInformation { diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index ff4572e1af..a3eb510c95 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -48,6 +48,8 @@ #include "gromacs/gpu_utils/gmxopencl.h" #include "gromacs/hardware/gpu_hw_info.h" +using DeviceTexture = void*; + //! OpenCL device vendors enum class DeviceVendor : int { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 241392f187..f7a12cf99d 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -121,7 +121,7 @@ namespace Nbnxm constexpr static int c_bufOpsThreadsPerBlock = 128; /*! Nonbonded kernel function pointer type */ -typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, bool); +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, bool); /*********************************/ @@ -330,7 +330,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int e /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use. 
*/ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const DeviceInformation gmx_unused* deviceInfo, - const cu_nbparam_t* nbp) + const NBParamGpu* nbp) { int shmem; @@ -483,7 +483,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; + NBParamGpu* nbp = nb->nbparam; cu_plist_t* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -596,7 +596,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; + NBParamGpu* nbp = nb->nbparam; cu_plist_t* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 962f5a049e..69beab6b6f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -93,7 +93,7 @@ static unsigned int gpu_min_ci_balanced_factor = 44; static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb); /* Fw. decl, */ -static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam); +static void nbnxn_cuda_free_nbparam_table(NBParamGpu* nbparam); /*! \brief Initialized the Ewald Coulomb correction GPU table. @@ -102,7 +102,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam); it just re-uploads the table. */ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, - cu_nbparam_t* nbp, + NBParamGpu* nbp, const DeviceContext& deviceContext) { if (nbp->coulomb_tab != nullptr) @@ -192,7 +192,7 @@ static int pick_ewald_kernel_type(const interaction_const_t& ic) } /*! Copies all parameters related to the cut-off from ic to nbp */ -static void set_cutoff_parameters(cu_nbparam_t* nbp, const interaction_const_t* ic, const PairlistParams& listParams) +static void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams) { nbp->ewald_beta = ic->ewaldcoeff_q; nbp->sh_ewald = ic->sh_ewald; @@ -215,7 +215,7 @@ static void set_cutoff_parameters(cu_nbparam_t* nbp, const interaction_const_t* } /*! Initializes the nonbonded parameter data structure. 
*/ -static void init_nbparam(cu_nbparam_t* nbp, +static void init_nbparam(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams, @@ -331,8 +331,8 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti { return; } - NbnxmGpu* nb = nbv->gpu_nbv; - cu_nbparam_t* nbp = nbv->gpu_nbv->nbparam; + NbnxmGpu* nb = nbv->gpu_nbv; + NBParamGpu* nbp = nbv->gpu_nbv->nbparam; set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params()); @@ -679,7 +679,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } } -static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam) +static void nbnxn_cuda_free_nbparam_table(NBParamGpu* nbparam) { if (nbparam->eeltype == eelTypeEWALD_TAB || nbparam->eeltype == eelTypeEWALD_TAB_TWIN) { @@ -691,7 +691,7 @@ void gpu_free(NbnxmGpu* nb) { cudaError_t stat; cu_atomdata_t* atdat; - cu_nbparam_t* nbparam; + NBParamGpu* nbparam; if (nb == nullptr) { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh index 89f75da5f9..7faea980b7 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh @@ -158,7 +158,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) # endif /* CALC_ENERGIES */ #endif /* PRUNE_NBL */ - (const cu_atomdata_t atdat, const cu_nbparam_t nbparam, const cu_plist_t plist, bool bCalcFshift) + (const cu_atomdata_t atdat, const NBParamGpu nbparam, const cu_plist_t plist, bool bCalcFshift) #ifdef FUNCTION_DECLARATION_ONLY ; /* Only do function declaration, omit the function body. */ #else diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu index 945b1912fc..81755cb903 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -39,7 +39,7 @@ #ifndef FUNCTION_DECLARATION_ONLY /* Instantiate external template functions */ template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); #endif diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh index e9c5b51143..e5bf2b967c 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh @@ -104,7 +104,7 @@ template __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__ void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat, - const cu_nbparam_t nbparam, + const NBParamGpu nbparam, const cu_plist_t plist, int numParts, int part) @@ -114,9 +114,9 @@ __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__ // Add extern declarations so each translation unit understands that // there will be a definition provided. extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, int, int); +nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const cu_plist_t, int, int); #else { diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh index cbca9452b9..4850298f8f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh @@ -90,7 +90,7 @@ static __forceinline__ __device__ void /*! Apply force switch, force + energy version. */ static __forceinline__ __device__ void - calculate_force_switch_F(const cu_nbparam_t nbparam, float c6, float c12, float inv_r, float r2, float* F_invr) + calculate_force_switch_F(const NBParamGpu nbparam, float c6, float c12, float inv_r, float r2, float* F_invr) { float r, r_switch; @@ -109,13 +109,13 @@ static __forceinline__ __device__ void } /*! Apply force switch, force-only version. */ -static __forceinline__ __device__ void calculate_force_switch_F_E(const cu_nbparam_t nbparam, - float c6, - float c12, - float inv_r, - float r2, - float* F_invr, - float* E_lj) +static __forceinline__ __device__ void calculate_force_switch_F_E(const NBParamGpu nbparam, + float c6, + float c12, + float inv_r, + float r2, + float* F_invr, + float* E_lj) { float r, r_switch; @@ -142,7 +142,7 @@ static __forceinline__ __device__ void calculate_force_switch_F_E(const cu_nbpar /*! Apply potential switch, force-only version. */ static __forceinline__ __device__ void - calculate_potential_switch_F(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj) + calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj) { float r, r_switch; float sw, dsw; @@ -170,7 +170,7 @@ static __forceinline__ __device__ void /*! Apply potential switch, force + energy version. 
*/ static __forceinline__ __device__ void - calculate_potential_switch_F_E(const cu_nbparam_t nbparam, float inv_r, float r2, float* F_invr, float* E_lj) + calculate_potential_switch_F_E(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj) { float r, r_switch; float sw, dsw; @@ -201,7 +201,7 @@ static __forceinline__ __device__ void * Depending on what is supported, it fetches parameters either * using direct load, texture objects, or texrefs. */ -static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam, int typei, int typej) +static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const NBParamGpu nbparam, int typei, int typej) { # if DISABLE_CUDA_TEXTURES return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]); @@ -215,14 +215,14 @@ static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const cu_nbpar /*! Calculate LJ-PME grid force contribution with * geometric combination rule. */ -static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam, - int typei, - int typej, - float r2, - float inv_r2, - float lje_coeff2, - float lje_coeff6_6, - float* F_invr) +static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const NBParamGpu nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float* F_invr) { float c6grid, inv_r6_nm, cr2, expmcr2, poly; @@ -242,12 +242,12 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const cu_n /*! Calculate LJ-PME grid force + energy contribution with * geometric combination rule. */ -static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam, - int typei, - int typej, - float r2, - float inv_r2, - float lje_coeff2, +static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NBParamGpu nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, float lje_coeff6_6, float int_bit, float* F_invr, @@ -276,7 +276,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const cu * Depending on what is supported, it fetches parameters either * using direct load, texture objects, or texrefs. */ -static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam, int type) +static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const NBParamGpu nbparam, int type) { float2 c6c12; # if DISABLE_CUDA_TEXTURES @@ -299,16 +299,16 @@ static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const cu_nbparam * We use a single F+E kernel with conditional because the performance impact * of this is pretty small and LB on the CPU is anyway very slow. */ -static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam, - int typei, - int typej, - float r2, - float inv_r2, - float lje_coeff2, - float lje_coeff6_6, - float int_bit, - float* F_invr, - float* E_lj) +static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const NBParamGpu nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float int_bit, + float* F_invr, + float* E_lj) { float c6grid, inv_r6_nm, cr2, expmcr2, poly; float sigma, sigma2, epsilon; @@ -348,7 +348,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const cu_n * Depending on what is supported, it fetches parameters either * using direct load, texture objects, or texrefs. 
*/ -static __forceinline__ __device__ float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam, int index) +static __forceinline__ __device__ float2 fetch_coulomb_force_r(const NBParamGpu nbparam, int index) { float2 d; @@ -379,7 +379,7 @@ __forceinline__ __host__ __device__ T lerp(T d0, T d1, T t) /*! Interpolate Ewald coulomb force correction using the F*r table. */ -static __forceinline__ __device__ float interpolate_coulomb_force_r(const cu_nbparam_t nbparam, float r) +static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBParamGpu nbparam, float r) { float normalized = nbparam.coulomb_tab_scale * r; int index = (int)normalized; @@ -395,7 +395,7 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const cu_nbp * Depending on what is supported, it fetches parameters either * using direct load, texture objects, or texrefs. */ -static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const cu_nbparam_t nbparam, int baseIndex) +static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex) { # if DISABLE_CUDA_TEXTURES /* Force an 8-byte fetch to save a memory instruction. */ diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 67f220d15d..acadca29c5 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -80,7 +80,6 @@ static constexpr int c_clSize = c_nbnxnGpuClusterSize; * are passed to the kernels, except cu_timers_t. */ /*! \cond */ typedef struct cu_atomdata cu_atomdata_t; -typedef struct cu_nbparam cu_nbparam_t; /*! \endcond */ @@ -139,72 +138,6 @@ struct cu_atomdata }; /** \internal - * \brief Parameters required for the CUDA nonbonded calculations. - */ -struct cu_nbparam -{ - - //! type of electrostatics, takes values from #eelType - int eeltype; - //! type of VdW impl., takes values from #evdwType - int vdwtype; - - //! charge multiplication factor - float epsfac; - //! Reaction-field/plain cutoff electrostatics const. - float c_rf; - //! Reaction-field electrostatics constant - float two_k_rf; - //! Ewald/PME parameter - float ewald_beta; - //! Ewald/PME correction term substracted from the direct-space potential - float sh_ewald; - //! LJ-Ewald/PME correction term added to the correction potential - float sh_lj_ewald; - //! LJ-Ewald/PME coefficient - float ewaldcoeff_lj; - - //! Coulomb cut-off squared - float rcoulomb_sq; - - //! VdW cut-off squared - float rvdw_sq; - //! VdW switched cut-off - float rvdw_switch; - //! Full, outer pair-list cut-off squared - float rlistOuter_sq; - //! Inner, dynamic pruned pair-list cut-off squared - float rlistInner_sq; - //! True if we use dynamic pair-list pruning - bool useDynamicPruning; - - //! VdW shift dispersion constants - shift_consts_t dispersion_shift; - //! VdW shift repulsion constants - shift_consts_t repulsion_shift; - //! VdW switch constants - switch_consts_t vdw_switch; - - /* LJ non-bonded parameters - accessed through texture memory */ - //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements - float* nbfp; - //! texture object bound to nbfp - cudaTextureObject_t nbfp_texobj; - //! nonbonded parameter table per atom type, 2*ntype elements - float* nbfp_comb; - //! texture object bound to nbfp_texobj - cudaTextureObject_t nbfp_comb_texobj; - - /* Ewald Coulomb force table data - accessed through texture memory */ - //! table scale/spacing - float coulomb_tab_scale; - //! 
pointer to the table in the device memory - float* coulomb_tab; - //! texture object bound to coulomb_tab - cudaTextureObject_t coulomb_tab_texobj; -}; - -/** \internal * \brief Pair list data. */ using cu_plist_t = Nbnxm::gpu_plist; @@ -255,7 +188,7 @@ struct NbnxmGpu /*! \brief number of elements allocated allocated in device buffer */ int ncxy_ind_alloc = 0; /*! \brief parameters required for the non-bonded calc. */ - cu_nbparam_t* nbparam = nullptr; + NBParamGpu* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded */ diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h index 28dbe70459..17b66e49d8 100644 --- a/src/gromacs/nbnxm/gpu_types_common.h +++ b/src/gromacs/nbnxm/gpu_types_common.h @@ -57,6 +57,72 @@ # include "gromacs/gpu_utils/gpuregiontimer.cuh" #endif +/** \internal + * \brief Parameters required for the GPU nonbonded calculations. + */ +struct NBParamGpu +{ + + //! type of electrostatics, takes values from #eelType + int eeltype; + //! type of VdW impl., takes values from #evdwType + int vdwtype; + + //! charge multiplication factor + float epsfac; + //! Reaction-field/plain cutoff electrostatics const. + float c_rf; + //! Reaction-field electrostatics constant + float two_k_rf; + //! Ewald/PME parameter + float ewald_beta; + //! Ewald/PME correction term substracted from the direct-space potential + float sh_ewald; + //! LJ-Ewald/PME correction term added to the correction potential + float sh_lj_ewald; + //! LJ-Ewald/PME coefficient + float ewaldcoeff_lj; + + //! Coulomb cut-off squared + float rcoulomb_sq; + + //! VdW cut-off squared + float rvdw_sq; + //! VdW switched cut-off + float rvdw_switch; + //! Full, outer pair-list cut-off squared + float rlistOuter_sq; + //! Inner, dynamic pruned pair-list cut-off squared + float rlistInner_sq; + //! True if we use dynamic pair-list pruning + bool useDynamicPruning; + + //! VdW shift dispersion constants + shift_consts_t dispersion_shift; + //! VdW shift repulsion constants + shift_consts_t repulsion_shift; + //! VdW switch constants + switch_consts_t vdw_switch; + + /* LJ non-bonded parameters - accessed through texture memory */ + //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements + DeviceBuffer nbfp; + //! texture object bound to nbfp + DeviceTexture nbfp_texobj; + //! nonbonded parameter table per atom type, 2*ntype elements + DeviceBuffer nbfp_comb; + //! texture object bound to nbfp_comb + DeviceTexture nbfp_comb_texobj; + + /* Ewald Coulomb force table data - accessed through texture memory */ + //! table scale/spacing + float coulomb_tab_scale; + //! pointer to the table in the device memory + DeviceBuffer coulomb_tab; + //! texture object bound to coulomb_tab + DeviceTexture coulomb_tab_texobj; +}; + namespace Nbnxm { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 226571dfd1..cd929a4dbd 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -431,7 +431,7 @@ static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjPar * * This function is called before the launch of both nbnxn and prune kernels. 
*/ -static void fillin_ocl_structures(cl_nbparam_t* nbp, cl_nbparam_params_t* nbparams_params) +static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params) { nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale; nbparams_params->c_rf = nbp->c_rf; @@ -585,7 +585,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; + NBParamGpu* nbp = nb->nbparam; cl_plist_t* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -669,8 +669,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb { const auto kernelArgs = prepareGpuKernelArguments( kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, - &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, - &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4, &plist->excl, &computeFshift); + &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb, + &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift); launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs); } @@ -678,9 +678,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb { const auto kernelArgs = prepareGpuKernelArguments( kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, - &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp_climg2d, - &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4, - &plist->excl, &computeFshift); + &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb, + &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift); launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs); } @@ -723,7 +722,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; + NBParamGpu* nbp = nb->nbparam; cl_plist_t* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index ce17f8152f..766789b930 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -105,20 +105,20 @@ static unsigned int gpu_min_ci_balanced_factor = 50; * table. 
*/ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, - cl_nbparam_t* nbp, + NBParamGpu* nbp, const DeviceContext& deviceContext) { - if (nbp->coulomb_tab_climg2d != nullptr) + if (nbp->coulomb_tab != nullptr) { - freeDeviceBuffer(&(nbp->coulomb_tab_climg2d)); + freeDeviceBuffer(&(nbp->coulomb_tab)); } DeviceBuffer coulomb_tab; initParamLookupTable(&coulomb_tab, nullptr, tables.tableF.data(), tables.tableF.size(), deviceContext); - nbp->coulomb_tab_climg2d = coulomb_tab; - nbp->coulomb_tab_scale = tables.scale; + nbp->coulomb_tab = coulomb_tab; + nbp->coulomb_tab_scale = tables.scale; } @@ -148,7 +148,7 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceConte /*! \brief Copies all parameters related to the cut-off from ic to nbp */ -static void set_cutoff_parameters(cl_nbparam_t* nbp, const interaction_const_t* ic, const PairlistParams& listParams) +static void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams) { nbp->ewald_beta = ic->ewaldcoeff_q; nbp->sh_ewald = ic->sh_ewald; @@ -244,7 +244,7 @@ static void map_interaction_types_to_gpu_kernel_flavors(const interaction_const_ /*! \brief Initializes the nonbonded parameter data structure. */ -static void init_nbparam(cl_nbparam_t* nbp, +static void init_nbparam(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams, @@ -266,7 +266,7 @@ static void init_nbparam(cl_nbparam_t* nbp, } } /* generate table for PME */ - nbp->coulomb_tab_climg2d = nullptr; + nbp->coulomb_tab = nullptr; if (nbp->eeltype == eelTypeEWALD_TAB || nbp->eeltype == eelTypeEWALD_TAB_TWIN) { GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables"); @@ -274,7 +274,7 @@ static void init_nbparam(cl_nbparam_t* nbp, } else { - allocateDeviceBuffer(&nbp->coulomb_tab_climg2d, 1, deviceContext); + allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext); } const int nnbfp = 2 * nbatParams.numTypes * nbatParams.numTypes; @@ -284,13 +284,13 @@ static void init_nbparam(cl_nbparam_t* nbp, /* set up LJ parameter lookup table */ DeviceBuffer nbfp; initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext); - nbp->nbfp_climg2d = nbfp; + nbp->nbfp = nbfp; if (ic->vdwtype == evdwPME) { DeviceBuffer nbfp_comb; initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext); - nbp->nbfp_comb_climg2d = nbfp_comb; + nbp->nbfp_comb = nbfp_comb; } } } @@ -302,8 +302,8 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti { return; } - NbnxmGpu* nb = nbv->gpu_nbv; - cl_nbparam_t* nbp = nb->nbparam; + NbnxmGpu* nb = nbv->gpu_nbv; + NBParamGpu* nbp = nb->nbparam; set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params()); @@ -444,7 +444,7 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) * clears e/fshift output buffers. 
*/ static void nbnxn_ocl_init_const(cl_atomdata_t* atomData, - cl_nbparam_t* nbParams, + NBParamGpu* nbParams, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams, @@ -825,9 +825,9 @@ void gpu_free(NbnxmGpu* nb) sfree(nb->atdat); /* Free nbparam */ - freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d)); - freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d)); - freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d)); + freeDeviceBuffer(&(nb->nbparam->nbfp)); + freeDeviceBuffer(&(nb->nbparam->nbfp_comb)); + freeDeviceBuffer(&(nb->nbparam->coulomb_tab)); sfree(nb->nbparam); /* Free plist */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index a1db11c930..a2f6913a90 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -147,66 +147,6 @@ typedef struct cl_atomdata } cl_atomdata_t; /*! \internal - * \brief Parameters required for the OpenCL nonbonded calculations. - */ -typedef struct cl_nbparam -{ - - //! type of electrostatics, takes values from #eelType - int eeltype; - //! type of VdW impl., takes values from #evdwType - int vdwtype; - - //! charge multiplication factor - float epsfac; - //! Reaction-field/plain cutoff electrostatics const. - float c_rf; - //! Reaction-field electrostatics constant - float two_k_rf; - //! Ewald/PME parameter - float ewald_beta; - //! Ewald/PME correction term substracted from the direct-space potential - float sh_ewald; - //! LJ-Ewald/PME correction term added to the correction potential - float sh_lj_ewald; - //! LJ-Ewald/PME coefficient - float ewaldcoeff_lj; - - //! Coulomb cut-off squared - float rcoulomb_sq; - - //! VdW cut-off squared - float rvdw_sq; - //! VdW switched cut-off - float rvdw_switch; - //! Full, outer pair-list cut-off squared - float rlistOuter_sq; - //! Inner, dynamic pruned pair-list cut-off squared - float rlistInner_sq; - //! True if we use dynamic pair-list pruning - bool useDynamicPruning; - - //! VdW shift dispersion constants - shift_consts_t dispersion_shift; - //! VdW shift repulsion constants - shift_consts_t repulsion_shift; - //! VdW switch constants - switch_consts_t vdw_switch; - - /* LJ non-bonded parameters - accessed through texture memory */ - //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements - cl_mem nbfp_climg2d; - //! nonbonded parameter table per atom type, 2*ntype elements - cl_mem nbfp_comb_climg2d; - - /* Ewald Coulomb force table data - accessed through texture memory */ - //! table scale/spacing - float coulomb_tab_scale; - //! pointer to the table in the device memory - DeviceBuffer coulomb_tab_climg2d; -} cl_nbparam_t; - -/*! \internal * \brief Data structure shared between the OpenCL device code and OpenCL host code * * Must not contain OpenCL objects (buffers) @@ -312,7 +252,7 @@ struct NbnxmGpu //! atom data cl_atomdata_t* atdat = nullptr; //! parameters required for the non-bonded calc. - cl_nbparam_t* nbparam = nullptr; + NBParamGpu* nbparam = nullptr; //! pair-list data structures (local and non-local) gmx::EnumerationArray plist = { nullptr }; //! staging area where fshift/energies get downloaded -- 2.11.4.GIT
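
The patch above moves the nonbonded parameter struct out of the CUDA- and OpenCL-specific headers into gpu_types_common.h under the single name NBParamGpu, so host-side helpers such as set_cutoff_parameters() can be written once against one type instead of being duplicated per back end. Below is a minimal, self-contained sketch of that idea. It is illustrative only: the local interaction_const_t and PairlistParams stand-ins, the reduced NBParamGpu (cut-off fields only), and the main() driver are simplified assumptions rather than the GROMACS definitions; the assignments mirror the set_cutoff_parameters() bodies visible in the diff above.

    // Sketch: one backend-agnostic parameter struct shared by CUDA and OpenCL host code.
    // Simplified stand-in types; not the actual GROMACS declarations.
    #include <cstdio>

    struct interaction_const_t
    {
        float ewaldcoeff_q, sh_ewald, epsfac, k_rf, c_rf;
        float rcoulomb, rvdw, rvdw_switch;
    };

    struct PairlistParams
    {
        float rlistOuter, rlistInner;
    };

    // Cut-off subset of NBParamGpu, common to both back ends.
    struct NBParamGpu
    {
        float ewald_beta, sh_ewald, epsfac, two_k_rf, c_rf;
        float rcoulomb_sq, rvdw_sq, rvdw_switch;
        float rlistOuter_sq, rlistInner_sq;
    };

    // With a single NBParamGpu type this helper no longer needs a
    // cu_nbparam_t and a cl_nbparam_t variant; one definition serves both paths.
    static void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
    {
        nbp->ewald_beta    = ic->ewaldcoeff_q;
        nbp->sh_ewald      = ic->sh_ewald;
        nbp->epsfac        = ic->epsfac;
        nbp->two_k_rf      = 2.0F * ic->k_rf;
        nbp->c_rf          = ic->c_rf;
        nbp->rcoulomb_sq   = ic->rcoulomb * ic->rcoulomb;
        nbp->rvdw_sq       = ic->rvdw * ic->rvdw;
        nbp->rvdw_switch   = ic->rvdw_switch;
        nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
        nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
    }

    int main()
    {
        // Placeholder numbers, only to exercise the shared helper.
        interaction_const_t ic = { 3.12F, 1.0e-5F, 138.935F, 0.0F, 0.0F, 1.0F, 1.0F, 0.9F };
        PairlistParams      lp = { 1.2F, 1.0F };
        NBParamGpu          nbp;
        set_cutoff_parameters(&nbp, &ic, lp);
        printf("rcoulomb_sq = %g, rlistOuter_sq = %g\n", nbp.rcoulomb_sq, nbp.rlistOuter_sq);
        return 0;
    }

The same reasoning applies to the texture/buffer members: by expressing them as DeviceBuffer<float> plus DeviceTexture (a cudaTextureObject_t on CUDA, void* on OpenCL, per the gputraits headers above), the struct definition itself becomes shareable even though the underlying handles differ per back end.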