From c048437f61b0fd1276917cfc8888798bd091e385 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 5 May 2020 06:37:33 +0000 Subject: [PATCH] Unify CUDA and OpenCL lookup-table creation In CUDA code, textures are used for the lookup-tables, whereas in OpenCL they are created as a read-only buffers. This commit hides these differences behind a unified wrapper. Refs #3318 Refs #3311 Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711 --- src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 82 ++++++++++---------------- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 17 +++--- 2 files changed, 39 insertions(+), 60 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index c6c7850493..526c2e7355 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -128,22 +128,15 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables, /*! Initializes the atomdata structure first time, it only gets filled at pair-search. */ -static void init_atomdata_first(cu_atomdata_t* ad, int ntypes) +static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext) { - cudaError_t stat; - ad->ntypes = ntypes; - stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS * sizeof(*ad->shift_vec)); - CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec"); + allocateDeviceBuffer(&ad->shift_vec, SHIFTS, deviceContext); ad->bShiftVecUploaded = false; - stat = cudaMalloc((void**)&ad->fshift, SHIFTS * sizeof(*ad->fshift)); - CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift"); - - stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj)); - CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj"); - stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el)); - CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el"); + allocateDeviceBuffer(&ad->fshift, SHIFTS, deviceContext); + allocateDeviceBuffer(&ad->e_lj, 1, deviceContext); + allocateDeviceBuffer(&ad->e_el, 1, deviceContext); /* initialize to nullptr poiters to data that is not allocated here and will need reallocation in nbnxn_cuda_init_atomdata */ @@ -412,7 +405,7 @@ static void cuda_init_const(NbnxmGpu* nb, const PairlistParams& listParams, const nbnxn_atomdata_t::Params& nbatParams) { - init_atomdata_first(nb->atdat, nbatParams.numTypes); + init_atomdata_first(nb->atdat, nbatParams.numTypes, *nb->deviceContext_); init_nbparam(nb->nbparam, ic, listParams, nbatParams, *nb->deviceContext_); /* clear energy and shift force outputs */ @@ -589,27 +582,20 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { - cudaError_t stat; - cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream(); - - stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); - CU_RET_ERR(stat, "cudaMemsetAsync on f falied"); + cu_atomdata_t* adat = nb->atdat; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream); } /*! Clears nonbonded shift force output array and energy outputs on the GPU. */ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { - cudaError_t stat; - cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream(); - - stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); - CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied"); - stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls); - CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied"); - stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls); - CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied"); + cu_atomdata_t* adat = nb->atdat; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + + clearDeviceBufferAsync(&adat->fshift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&adat->e_lj, 0, 1, localStream); + clearDeviceBufferAsync(&adat->e_el, 0, 1, localStream); } void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) @@ -625,13 +611,13 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { - cudaError_t stat; - int nalloc, natoms; - bool realloced; - bool bDoTime = nb->bDoTime; - cu_timers_t* timers = nb->timers; - cu_atomdata_t* d_atdat = nb->atdat; - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + int nalloc, natoms; + bool realloced; + bool bDoTime = nb->bDoTime; + cu_timers_t* timers = nb->timers; + cu_atomdata_t* d_atdat = nb->atdat; + const DeviceContext& deviceContext = *nb->deviceContext_; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -657,19 +643,15 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) freeDeviceBuffer(&d_atdat->lj_comb); } - stat = cudaMalloc((void**)&d_atdat->f, nalloc * sizeof(*d_atdat->f)); - CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f"); - stat = cudaMalloc((void**)&d_atdat->xq, nalloc * sizeof(*d_atdat->xq)); - CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq"); + allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext); if (useLjCombRule(nb->nbparam)) { - stat = cudaMalloc((void**)&d_atdat->lj_comb, nalloc * sizeof(*d_atdat->lj_comb)); - CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->lj_comb"); + allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); } else { - stat = cudaMalloc((void**)&d_atdat->atom_types, nalloc * sizeof(*d_atdat->atom_types)); - CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types"); + allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext); } d_atdat->nalloc = nalloc; @@ -748,15 +730,11 @@ void gpu_free(NbnxmGpu* nb) destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj); } - stat = cudaFree(atdat->shift_vec); - CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec"); - stat = cudaFree(atdat->fshift); - CU_RET_ERR(stat, "cudaFree failed on atdat->fshift"); + freeDeviceBuffer(&atdat->shift_vec); + freeDeviceBuffer(&atdat->fshift); - stat = cudaFree(atdat->e_lj); - CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj"); - stat = cudaFree(atdat->e_el); - CU_RET_ERR(stat, "cudaFree failed on atdat->e_el"); + freeDeviceBuffer(&atdat->e_lj); + freeDeviceBuffer(&atdat->e_el); freeDeviceBuffer(&atdat->f); freeDeviceBuffer(&atdat->xq); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 68d5da81c4..eca71b1360 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -49,6 +49,7 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/nbnxm/gpu_types_common.h" @@ -158,27 +159,27 @@ struct cu_atomdata int nalloc; //! atom coordinates + charges, size natoms - float4* xq; + DeviceBuffer xq; //! force output array, size natoms - float3* f; + DeviceBuffer f; //! LJ energy output, size 1 - float* e_lj; + DeviceBuffer e_lj; //! Electrostatics energy input, size 1 - float* e_el; + DeviceBuffer e_el; //! shift forces - float3* fshift; + DeviceBuffer fshift; //! number of atom types int ntypes; //! atom type indices, size natoms - int* atom_types; + DeviceBuffer atom_types; //! sqrt(c6),sqrt(c12) size natoms - float2* lj_comb; + DeviceBuffer lj_comb; //! shifts - float3* shift_vec; + DeviceBuffer shift_vec; //! true if the shift vector has been uploaded bool bShiftVecUploaded; }; -- 2.11.4.GIT