From 6c0ab48c23a64aee18d443a2f9d62e1c87b12379 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 1 Nov 2019 07:45:25 -0700
Subject: [PATCH] Replace blocking with non-blocking receive in GPU PME
 coordinate receiver

Replaces MPI_Recv with MPI_Irecv in the original coordinate receiver
method, and adds an associated method containing MPI_Waitall, which is
called to wait for completion of the receives from all PP ranks.

Implements part of #3158

Change-Id: Ifd152973e4d9a4da53c0541e591b41d29be173bb
---
 src/gromacs/ewald/pme_coordinate_receiver_gpu.h |  9 +++--
 .../ewald/pme_coordinate_receiver_gpu_impl.cpp  |  9 ++++-
 .../ewald/pme_coordinate_receiver_gpu_impl.cu   | 39 +++++++++++++++++-----
 .../ewald/pme_coordinate_receiver_gpu_impl.h    | 15 +++++++--
 src/gromacs/ewald/pme_only.cpp                  |  8 ++++-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
index 774c831f03..dfe8481593 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h
@@ -70,10 +70,15 @@ public:

     /*! \brief
-     * receive coordinate data from PP rank
+     * launch receive of coordinate data from PP rank
      * \param[in] ppRank PP rank to send data
      */
-    void receiveCoordinatesFromPpCudaDirect(int ppRank);
+    void launchReceiveCoordinatesFromPpCudaDirect(int ppRank);
+
+    /*! \brief
+     * enqueue wait for coordinate data from PP ranks
+     */
+    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();

 private:
     class Impl;

diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
index b92637f046..35ddc07733 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp
@@ -80,7 +80,14 @@ void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec gmx_unu
                "correct implementation.");
 }

-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int gmx_unused ppRank)
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaDirect(int gmx_unused ppRank)
+{
+    GMX_ASSERT(false,
+               "A CPU stub for PME-PP GPU communication was called instead of the correct "
+               "implementation.");
+}
+
+void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
 {
     GMX_ASSERT(false,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
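The CUDA implementation below applies the standard MPI non-blocking receive
pattern: post one MPI_Irecv per sending rank, then complete them all with a
single MPI_Waitall. The following toy program (a standalone illustration of
that pattern, not GROMACS code; all names are made up) shows it in isolation:

    // Rank 0 posts one non-blocking receive per sender, then waits on all
    // of them at once instead of blocking on each receive in turn.
    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char* argv[])
    {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        if (rank == 0)
        {
            std::vector<int>         payload(size - 1);
            std::vector<MPI_Request> request(size - 1);
            // "launch" phase: one MPI_Irecv per sending rank
            for (int src = 1; src < size; src++)
            {
                MPI_Irecv(&payload[src - 1], 1, MPI_INT, src, 0, MPI_COMM_WORLD,
                          &request[src - 1]);
            }
            // "wait" phase: one call completes every outstanding receive
            MPI_Waitall(size - 1, request.data(), MPI_STATUSES_IGNORE);
            for (int i = 0; i < size - 1; i++)
            {
                printf("received %d\n", payload[i]);
            }
        }
        else
        {
            int value = rank * 10;
            MPI_Send(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        }
        MPI_Finalize();
        return 0;
    }

Note that the completion call for multiple requests takes MPI_STATUSES_IGNORE;
MPI_STATUS_IGNORE is only valid for single-request operations such as MPI_Recv.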
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
index 171896b528..dbb0ace8b2 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
@@ -66,6 +66,8 @@ PmeCoordinateReceiverGpu::Impl::Impl(void* pmeStream, MPI_Comm comm, gmx::ArrayR
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
+    request_.resize(ppRanks.size());
+    ppSync_.resize(ppRanks.size());
 }

 PmeCoordinateReceiverGpu::Impl::~Impl() = default;
@@ -90,19 +92,35 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(rvec*
 }

 /*! \brief Receive coordinate data directly using CUDA memory copy */
-void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesFromPpCudaDirect(int ppRank)
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaDirect(int ppRank)
 {
     // Data will be pushed directly from PP task
 #if GMX_MPI
-    // Receive event from PP task and add to PME stream, to ensure PME calculation doesn't
-    // commence until coordinate data has been transferred
-    GpuEventSynchronizer* ppSync;
-    MPI_Recv(&ppSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
-    ppSync->enqueueWaitEvent(pmeStream_);
+    // Receive event from PP task
+    MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_,
+              &request_[recvCount_]);
+    recvCount_++;
 #endif
 }

+void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+{
+    if (recvCount_ > 0)
+    {
+        // ensure PME calculation doesn't commence until coordinate data has been transferred
+#if GMX_MPI
+        MPI_Waitall(recvCount_, request_.data(), MPI_STATUSES_IGNORE);
+#endif
+        for (int i = 0; i < recvCount_; i++)
+        {
+            ppSync_[i]->enqueueWaitEvent(pmeStream_);
+        }
+        // reset receive counter
+        recvCount_ = 0;
+    }
+}
+
 PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(void*                  pmeStream,
                                                    MPI_Comm               comm,
                                                    gmx::ArrayRef<PpRanks> ppRanks) :
@@ -117,9 +135,14 @@ void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec* d_x)
     impl_->sendCoordinateBufferAddressToPpRanks(d_x);
 }

-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int ppRank)
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaDirect(int ppRank)
+{
+    impl_->launchReceiveCoordinatesFromPpCudaDirect(ppRank);
+}
+
+void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
 {
-    impl_->receiveCoordinatesFromPpCudaDirect(ppRank);
+    impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
 }

 } // namespace gmx
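Note that the message received above is not the coordinate data itself but a
pointer to a GpuEventSynchronizer recorded by the sending PP rank; this is
only meaningful because thread-MPI ranks share an address space, which the
constructor asserts. Once MPI_Waitall returns, each event is enqueued on the
PME stream so that GPU work waits for the direct coordinate transfers without
blocking the host. A simplified sketch of that enqueue step, using raw CUDA
events as a stand-in for GpuEventSynchronizer (hypothetical helper, not
GROMACS code):

    #include <cuda_runtime.h>
    #include <vector>

    // After MPI_Waitall has completed, make each received event a dependency
    // of the PME stream: cudaStreamWaitEvent returns immediately on the host
    // and only orders work submitted to the stream behind the event.
    static void enqueueWaitOnReceivedEvents(const std::vector<cudaEvent_t>& ppEvents,
                                            int                             recvCount,
                                            cudaStream_t                    pmeStream)
    {
        for (int i = 0; i < recvCount; i++)
        {
            cudaStreamWaitEvent(pmeStream, ppEvents[i], 0);
        }
    }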
diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
index 015ae346d4..fe689799e1 100644
--- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
+++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h
@@ -70,10 +70,15 @@ public:
     void sendCoordinateBufferAddressToPpRanks(rvec* d_x);

     /*! \brief
-     * receive coordinate data from PP rank
+     * launch receive of coordinate data from PP rank
      * \param[in] ppRank PP rank to send data
      */
-    void receiveCoordinatesFromPpCudaDirect(int ppRank);
+    void launchReceiveCoordinatesFromPpCudaDirect(int ppRank);
+
+    /*! \brief
+     * enqueue wait for coordinate data from PP ranks
+     */
+    void enqueueWaitReceiveCoordinatesFromPpCudaDirect();

 private:
     //! CUDA stream for PME operations
@@ -82,6 +87,12 @@ private:
     MPI_Comm comm_;
     //! list of PP ranks
     gmx::ArrayRef<PpRanks> ppRanks_;
+    //! vector of MPI requests
+    std::vector<MPI_Request> request_;
+    //! vector of synchronization events to receive from PP tasks
+    std::vector<GpuEventSynchronizer*> ppSync_;
+    //! counter of messages to receive
+    int recvCount_ = 0;
 };

 } // namespace gmx
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5d99b07139..0610a2dc67 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -451,7 +451,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
                 {
                     if (pme_pp->useGpuDirectComm)
                     {
-                        pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesFromPpCudaDirect(sender.rankId);
+                        pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaDirect(
+                                sender.rankId);
                     }
                     else
                     {
@@ -469,6 +470,11 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme,
                 }
             }

+            if (pme_pp->useGpuDirectComm)
+            {
+                pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+            }
+
             status = pmerecvqxX;
         }

--
2.11.4.GIT
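With these changes, the calling sequence in gmx_pme_recv_coeffs_coords becomes
launch-per-sender followed by a single wait. Schematically (Receiver and
receiveAllCoordinates are hypothetical simplified stand-ins for
PmeCoordinateReceiverGpu and the real receive loop, with trivial bodies):

    #include <vector>

    class Receiver
    {
    public:
        void launchReceiveCoordinatesFromPpCudaDirect(int /*ppRank*/)
        {
            // would post an MPI_Irecv for this PP rank and bump recvCount_
        }
        void enqueueWaitReceiveCoordinatesFromPpCudaDirect()
        {
            // would MPI_Waitall on all posted receives, then enqueue one
            // PME-stream wait per received synchronization event
        }
    };

    void receiveAllCoordinates(Receiver& receiver, const std::vector<int>& ppRanks)
    {
        // Launch phase: every receive is in flight before any completion
        // wait, so transfers from different PP ranks can overlap.
        for (int ppRank : ppRanks)
        {
            receiver.launchReceiveCoordinatesFromPpCudaDirect(ppRank);
        }
        // Wait phase: a single call completes all outstanding receives.
        receiver.enqueueWaitReceiveCoordinatesFromPpCudaDirect();
    }

The benefit over the blocking version is that a slow sender no longer
serializes the others: the receives complete in whatever order the messages
arrive, and the PME rank pays one completion wait rather than one per PP rank.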