From 6c0ab48c23a64aee18d443a2f9d62e1c87b12379 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 1 Nov 2019 07:45:25 -0700
Subject: [PATCH] Replace blocking with non-blocking receive in GPU PME
 coordinate receiver

Replaces MPI_Recv with MPI_Irecv in the original coordinate receiver
method, and adds an associated method containing MPI_Waitall, which is
called to wait for completion of the receives from all PP ranks.

Implements part of #3158

Change-Id: Ifd152973e4d9a4da53c0541e591b41d29be173bb
---
 src/gromacs/ewald/pme_coordinate_receiver_gpu.h |  9 +++--
 .../ewald/pme_coordinate_receiver_gpu_impl.cpp  |  9 ++++-
 .../ewald/pme_coordinate_receiver_gpu_impl.cu   | 39 +++++++++++++++++-----
 .../ewald/pme_coordinate_receiver_gpu_impl.h    | 15 +++++++--
 src/gromacs/ewald/pme_only.cpp                  |  8 ++++-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
index 774c831f03..dfe8481593 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
@@ -70,10 +70,15 @@ public:

     /*! \brief
-     * receive coordinate data from PP rank
+     * launch receive of coordinate data from PP rank
      * \param[in] ppRank PP rank to send data
      */
-    void receiveCoordinatesFromPpCudaDirect(int ppRank);
+    void launchReceiveCoordinatesFromPpCudaDirect(int ppRank);
+
+    /*! \brief
+     * enqueue wait for coordinate data from PP ranks
+     */
+    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();

 private:
     class Impl;

diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
index b92637f046..35ddc07733 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
@@ -80,7 +80,14 @@ void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec gmx_unu
                "correct implementation.");
 }

-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int gmx_unused ppRank)
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaDirect(int gmx_unused ppRank)
+{
+    GMX_ASSERT(false,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
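The CUDA implementation below applies the standard MPI non-blocking receive
pattern: post one MPI_Irecv per sending rank, then complete them all with a
single MPI_Waitall. The following toy program (a standalone illustration of
that pattern, not GROMACS code; all names are made up) shows it in isolation:

    // Rank 0 posts one non-blocking receive per sender, then waits on all
    // of them at once instead of blocking on each receive in turn.
    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char* argv[])
    {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        if (rank == 0)
        {
            std::vector<int>         payload(size - 1);
            std::vector<MPI_Request> request(size - 1);
            // "launch" phase: one MPI_Irecv per sending rank
            for (int src = 1; src < size; src++)
            {
                MPI_Irecv(&payload[src - 1], 1, MPI_INT, src, 0, MPI_COMM_WORLD,
                          &request[src - 1]);
            }
            // "wait" phase: one call completes every outstanding receive
            MPI_Waitall(size - 1, request.data(), MPI_STATUSES_IGNORE);
            for (int i = 0; i < size - 1; i++)
            {
                printf("received %d\n", payload[i]);
            }
        }
        else
        {
            int value = rank * 10;
            MPI_Send(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        }
        MPI_Finalize();
        return 0;
    }

Note that the completion call for multiple requests takes MPI_STATUSES_IGNORE;
MPI_STATUS_IGNORE is only valid for single-request operations such as MPI_Recv.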
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
index 171896b528..dbb0ace8b2 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
@@ -66,6 +66,8 @@ PmeCoordinateReceiverGpu::Impl::Impl(void* pmeStream, MPI_Comm comm, gmx::ArrayR
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
+    request_.resize(ppRanks.size());
+    ppSync_.resize(ppRanks.size());
 }

 PmeCoordinateReceiverGpu::Impl::~Impl() = default;
@@ -90,19 +92,35 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(rvec*
 }

 /*! \brief Receive coordinate data directly using CUDA memory copy */
-void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesFromPpCudaDirect(int ppRank)
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaDirect(int ppRank)
 {
     // Data will be pushed directly from PP task
 #if GMX_MPI
-    // Receive event from PP task and add to PME stream, to ensure PME calculation doesn't
-    // commence until coordinate data has been transferred
-    GpuEventSynchronizer* ppSync;
-    MPI_Recv(&ppSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
-    ppSync->enqueueWaitEvent(pmeStream_);
+    // Receive event from PP task
+    MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_,
+              &request_[recvCount_]);
+    recvCount_++;
 #endif
 }

+void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+{
+    if (recvCount_ > 0)
+    {
+        // ensure PME calculation doesn't commence until coordinate data has been transferred
+#if GMX_MPI
+        MPI_Waitall(recvCount_, request_.data(), MPI_STATUSES_IGNORE);
+#endif
+        for (int i = 0; i < recvCount_; i++)
+        {
+            ppSync_[i]->enqueueWaitEvent(pmeStream_);
+        }
+        // reset receive counter
+        recvCount_ = 0;
+    }
+}
+
 PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(void*                  pmeStream,
                                                    MPI_Comm               comm,
                                                    gmx::ArrayRef<PpRanks> ppRanks) :
@@ -117,9 +135,14 @@ void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec* d_x)
     impl_->sendCoordinateBufferAddressToPpRanks(d_x);
 }

-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int ppRank)
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaDirect(int ppRank)
+{
+    impl_->launchReceiveCoordinatesFromPpCudaDirect(ppRank);
+}
+
+void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
 {
-    impl_->receiveCoordinatesFromPpCudaDirect(ppRank);
+    impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
 }

 } // namespace gmx
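Note that the message received above is not the coordinate data itself but a
pointer to a GpuEventSynchronizer recorded by the sending PP rank; this is
only meaningful because thread-MPI ranks share an address space, which the
constructor asserts. Once MPI_Waitall returns, each event is enqueued on the
PME stream so that GPU work waits for the direct coordinate transfers without
blocking the host. A simplified sketch of that enqueue step, using raw CUDA
events as a stand-in for GpuEventSynchronizer (hypothetical helper, not
GROMACS code):

    #include <cuda_runtime.h>
    #include <vector>

    // After MPI_Waitall has completed, make each received event a dependency
    // of the PME stream: cudaStreamWaitEvent returns immediately on the host
    // and only orders work submitted to the stream behind the event.
    static void enqueueWaitOnReceivedEvents(const std::vector<cudaEvent_t>& ppEvents,
                                            int                             recvCount,
                                            cudaStream_t                    pmeStream)
    {
        for (int i = 0; i < recvCount; i++)
        {
            cudaStreamWaitEvent(pmeStream, ppEvents[i], 0);
        }
    }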
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
index 015ae346d4..fe689799e1 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
@@ -70,10 +70,15 @@ public:
     void sendCoordinateBufferAddressToPpRanks(rvec* d_x);

     /*! \brief
-     * receive coordinate data from PP rank
+     * launch receive of coordinate data from PP rank
      * \param[in] ppRank PP rank to send data
      */
-    void receiveCoordinatesFromPpCudaDirect(int ppRank);
+    void launchReceiveCoordinatesFromPpCudaDirect(int ppRank);
+
+    /*! \brief
+     * enqueue wait for coordinate data from PP ranks
+     */
+    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();

 private:
     //! CUDA stream for PME operations
@@ -82,6 +87,12 @@ private:
     MPI_Comm comm_;
     //! list of PP ranks
     gmx::ArrayRef<PpRanks> ppRanks_;
+    //! vector of MPI requests
+    std::vector<MPI_Request> request_;
+    //! vector of synchronization events to receive from PP tasks
+    std::vector<GpuEventSynchronizer*> ppSync_;
+    //! counter of messages to receive
+    int recvCount_ = 0;
 };

 } // namespace gmx
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5d99b07139..0610a2dc67 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -451,7 +451,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
                 {
                     if (pme_pp->useGpuDirectComm)
                     {
-                        pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesFromPpCudaDirect(sender.rankId);
+                        pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaDirect(
+                                sender.rankId);
                     }
                     else
                     {
@@ -469,6 +470,11 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
                 }
             }

+            if (pme_pp->useGpuDirectComm)
+            {
+                pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+            }
+
             status = pmerecvqxX;
         }

--
2.11.4.GIT
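With these changes, the calling sequence in gmx_pme_recv_coeffs_coords becomes
launch-per-sender followed by a single wait. Schematically (Receiver and
receiveAllCoordinates are hypothetical simplified stand-ins for
PmeCoordinateReceiverGpu and the real receive loop, with trivial bodies):

    #include <vector>

    class Receiver
    {
    public:
        void launchReceiveCoordinatesFromPpCudaDirect(int /*ppRank*/)
        {
            // would post an MPI_Irecv for this PP rank and bump recvCount_
        }
        void enqueueWaitReceiveCoordinatesFromPpCudaDirect()
        {
            // would MPI_Waitall on all posted receives, then enqueue one
            // PME-stream wait per received synchronization event
        }
    };

    void receiveAllCoordinates(Receiver& receiver, const std::vector<int>& ppRanks)
    {
        // Launch phase: every receive is in flight before any completion
        // wait, so transfers from different PP ranks can overlap.
        for (int ppRank : ppRanks)
        {
            receiver.launchReceiveCoordinatesFromPpCudaDirect(ppRank);
        }
        // Wait phase: a single call completes all outstanding receives.
        receiver.enqueueWaitReceiveCoordinatesFromPpCudaDirect();
    }

The benefit over the blocking version is that a slow sender no longer
serializes the others: the receives complete in whatever order the messages
arrive, and the PME rank pays one completion wait rather than one per PP rank.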