src/gromacs/ewald/pme_pp.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  *
  40  * \brief This file contains function definitions necessary for
  41  * managing the offload of long-ranged PME work to separate MPI rank,
  42  * for computing energies and forces (Coulomb and LJ).
  43  *
  44  * \author Berk Hess <hess@kth.se>
  45  * \ingroup module_ewald
  46  */
  47
  48 #include "gmxpre.h"
  49
  50 #include "pme_pp.h"
  51
  52 #include "config.h"
  53
  54 #include <cstdio>
  55 #include <cstring>
  56
  57 #include "gromacs/domdec/domdec.h"
  58 #include "gromacs/domdec/domdec_struct.h"
  59 #include "gromacs/ewald/pme.h"
  60 #include "gromacs/ewald/pme_pp_comm_gpu.h"
  61 #include "gromacs/gmxlib/network.h"
  62 #include "gromacs/math/vec.h"
  63 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  64 #include "gromacs/mdtypes/commrec.h"
  65 #include "gromacs/mdtypes/forceoutput.h"
  66 #include "gromacs/mdtypes/forcerec.h"
  67 #include "gromacs/mdtypes/interaction_const.h"
  68 #include "gromacs/mdtypes/md_enums.h"
  69 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
  70 #include "gromacs/nbnxm/nbnxm.h"
  71 #include "gromacs/timing/wallcycle.h"
  72 #include "gromacs/utility/fatalerror.h"
  73 #include "gromacs/utility/gmxmpi.h"
  74 #include "gromacs/utility/smalloc.h"
  75
  76 #include "pme_pp_communication.h"
  77
  78 /*! \brief Block to wait for communication to PME ranks to complete
  79  *
  80  * This should be faster with a real non-blocking MPI implementation
  81  */
  82 static constexpr bool c_useDelayedWait = false;
  83
  84 /*! \brief Wait for the pending data send requests to PME ranks to complete */
  85 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
  86 {
  87     if (dd->nreq_pme)
  88     {
  89 #if GMX_MPI
  90         MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
  91 #endif
  92         dd->nreq_pme = 0;
  93     }
  94 }
  95
  96 /*! \brief Send data to PME ranks */
  97 static void gmx_pme_send_coeffs_coords(t_forcerec*      fr,
  98                                        const t_commrec* cr,
  99                                        unsigned int     flags,
 100                                        real gmx_unused* chargeA,
 101                                        real gmx_unused* chargeB,
 102                                        real gmx_unused* c6A,
 103                                        real gmx_unused* c6B,
 104                                        real gmx_unused* sigmaA,
 105                                        real gmx_unused* sigmaB,
 106                                        const matrix     box,
 107                                        const rvec gmx_unused* x,
 108                                        real                   lambda_q,
 109                                        real                   lambda_lj,
 110                                        int                    maxshift_x,
 111                                        int                    maxshift_y,
 112                                        int64_t                step,
 113                                        bool                   useGpuPmePpComms,
 114                                        bool                   reinitGpuPmePpComms,
 115                                        bool                   sendCoordinatesFromGpu,
 116                                        GpuEventSynchronizer*  coordinatesReadyOnDeviceEvent)
 117 {
 118     gmx_domdec_t*         dd;
 119     gmx_pme_comm_n_box_t* cnb;
 120     int                   n;
 121
 122     dd = cr->dd;
 123     n  = dd_numHomeAtoms(*dd);
 124
 125     if (debug)
 126     {
 127         fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr->sim_nodeid, dd->pme_nodeid,
 128                 n, (flags & PP_PME_CHARGE) ? " charges" : "", (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
 129                 (flags & PP_PME_SIGMA) ? " sigma" : "", (flags & PP_PME_COORD) ? " coordinates" : "");
 130     }
 131
 132     if (useGpuPmePpComms)
 133     {
 134         flags |= PP_PME_GPUCOMMS;
 135     }
 136
 137     if (c_useDelayedWait)
 138     {
 139         /* We can not use cnb until pending communication has finished */
 140         gmx_pme_send_coeffs_coords_wait(dd);
 141     }
 142
 143     if (dd->pme_receive_vir_ener)
 144     {
 145         /* Peer PP node: communicate all data */
 146         if (dd->cnb == nullptr)
 147         {
 148             snew(dd->cnb, 1);
 149         }
 150         cnb = dd->cnb;
 151
 152         cnb->flags      = flags;
 153         cnb->natoms     = n;
 154         cnb->maxshift_x = maxshift_x;
 155         cnb->maxshift_y = maxshift_y;
 156         cnb->lambda_q   = lambda_q;
 157         cnb->lambda_lj  = lambda_lj;
 158         cnb->step       = step;
 159         if (flags & PP_PME_COORD)
 160         {
 161             copy_mat(box, cnb->box);
 162         }
 163 #if GMX_MPI
 164         MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
 165                   &dd->req_pme[dd->nreq_pme++]);
 166 #endif
 167     }
 168     else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
 169     {
 170 #if GMX_MPI
 171         /* Communicate only the number of atoms */
 172         MPI_Isend(&n, sizeof(n), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
 173                   &dd->req_pme[dd->nreq_pme++]);
 174 #endif
 175     }
 176
 177 #if GMX_MPI
 178     if (n > 0)
 179     {
 180         if (flags & PP_PME_CHARGE)
 181         {
 182             MPI_Isend(chargeA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeA,
 183                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 184         }
 185         if (flags & PP_PME_CHARGEB)
 186         {
 187             MPI_Isend(chargeB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeB,
 188                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 189         }
 190         if (flags & PP_PME_SQRTC6)
 191         {
 192             MPI_Isend(c6A, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6A,
 193                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 194         }
 195         if (flags & PP_PME_SQRTC6B)
 196         {
 197             MPI_Isend(c6B, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6B,
 198                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 199         }
 200         if (flags & PP_PME_SIGMA)
 201         {
 202             MPI_Isend(sigmaA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaA,
 203                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 204         }
 205         if (flags & PP_PME_SIGMAB)
 206         {
 207             MPI_Isend(sigmaB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaB,
 208                       cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 209         }
 210         if (flags & PP_PME_COORD)
 211         {
 212             if (reinitGpuPmePpComms)
 213             {
 214                 fr->pmePpCommGpu->reinit(n);
 215             }
 216
 217
 218             /* MPI_Isend does not accept a const buffer pointer */
 219             real* xRealPtr = const_cast<real*>(x[0]);
 220             if (useGpuPmePpComms && (fr != nullptr))
 221             {
 222                 void* sendPtr = sendCoordinatesFromGpu
 223                                         ? static_cast<void*>(fr->stateGpu->getCoordinates())
 224                                         : static_cast<void*>(xRealPtr);
 225                 fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu,
 226                                                                  coordinatesReadyOnDeviceEvent);
 227             }
 228             else
 229             {
 230                 MPI_Isend(xRealPtr, n * sizeof(rvec), MPI_BYTE, dd->pme_nodeid, eCommType_COORD,
 231                           cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
 232             }
 233         }
 234     }
 235 #else
 236     GMX_UNUSED_VALUE(fr);
 237     GMX_UNUSED_VALUE(reinitGpuPmePpComms);
 238     GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
 239     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 240 #endif
 241     if (!c_useDelayedWait)
 242     {
 243         /* Wait for the data to arrive */
 244         /* We can skip this wait as we are sure x and q will not be modified
 245          * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
 246          */
 247         gmx_pme_send_coeffs_coords_wait(dd);
 248     }
 249 }
 250
 251 void gmx_pme_send_parameters(const t_commrec*           cr,
 252                              const interaction_const_t* ic,
 253                              gmx_bool                   bFreeEnergy_q,
 254                              gmx_bool                   bFreeEnergy_lj,
 255                              real*                      chargeA,
 256                              real*                      chargeB,
 257                              real*                      sqrt_c6A,
 258                              real*                      sqrt_c6B,
 259                              real*                      sigmaA,
 260                              real*                      sigmaB,
 261                              int                        maxshift_x,
 262                              int                        maxshift_y)
 263 {
 264     unsigned int flags = 0;
 265
 266     if (EEL_PME(ic->eeltype))
 267     {
 268         flags |= PP_PME_CHARGE;
 269     }
 270     if (EVDW_PME(ic->vdwtype))
 271     {
 272         flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
 273     }
 274     if (bFreeEnergy_q || bFreeEnergy_lj)
 275     {
 276         /* Assumes that the B state flags are in the bits just above
 277          * the ones for the A state. */
 278         flags |= (flags << 1);
 279     }
 280
 281     gmx_pme_send_coeffs_coords(nullptr, cr, flags, chargeA, chargeB, sqrt_c6A, sqrt_c6B, sigmaA,
 282                                sigmaB, nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false,
 283                                false, false, nullptr);
 284 }
 285
 286 void gmx_pme_send_coordinates(t_forcerec*           fr,
 287                               const t_commrec*      cr,
 288                               const matrix          box,
 289                               const rvec*           x,
 290                               real                  lambda_q,
 291                               real                  lambda_lj,
 292                               bool                  computeEnergyAndVirial,
 293                               int64_t               step,
 294                               bool                  useGpuPmePpComms,
 295                               bool                  receiveCoordinateAddressFromPme,
 296                               bool                  sendCoordinatesFromGpu,
 297                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
 298                               gmx_wallcycle*        wcycle)
 299 {
 300     wallcycle_start(wcycle, ewcPP_PMESENDX);
 301
 302     unsigned int flags = PP_PME_COORD;
 303     if (computeEnergyAndVirial)
 304     {
 305         flags |= PP_PME_ENER_VIR;
 306     }
 307     gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 308                                box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms,
 309                                receiveCoordinateAddressFromPme, sendCoordinatesFromGpu,
 310                                coordinatesReadyOnDeviceEvent);
 311
 312     wallcycle_stop(wcycle, ewcPP_PMESENDX);
 313 }
 314
 315 void gmx_pme_send_finish(const t_commrec* cr)
 316 {
 317     unsigned int flags = PP_PME_FINISH;
 318
 319     gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
 320                                nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
 321 }
 322
 323 void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
 324 {
 325 #if GMX_MPI
 326     gmx_pme_comm_n_box_t cnb;
 327
 328     /* Only let one PP node signal each PME node */
 329     if (cr->dd->pme_receive_vir_ener)
 330     {
 331         cnb.flags = PP_PME_SWITCHGRID;
 332         copy_ivec(grid_size, cnb.grid_size);
 333         cnb.ewaldcoeff_q  = ewaldcoeff_q;
 334         cnb.ewaldcoeff_lj = ewaldcoeff_lj;
 335
 336         /* We send this, uncommon, message blocking to simplify the code */
 337         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
 338     }
 339 #else
 340     GMX_UNUSED_VALUE(cr);
 341     GMX_UNUSED_VALUE(grid_size);
 342     GMX_UNUSED_VALUE(ewaldcoeff_q);
 343     GMX_UNUSED_VALUE(ewaldcoeff_lj);
 344 #endif
 345 }
 346
 347 void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
 348 {
 349 #if GMX_MPI
 350     gmx_pme_comm_n_box_t cnb;
 351
 352     /* Only let one PP node signal each PME node */
 353     if (cr->dd->pme_receive_vir_ener)
 354     {
 355         cnb.flags = PP_PME_RESETCOUNTERS;
 356         cnb.step  = step;
 357
 358         /* We send this, uncommon, message blocking to simplify the code */
 359         MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
 360     }
 361 #endif
 362 }
 363
 364 /*! \brief Receive virial and energy from PME rank */
 365 static void receive_virial_energy(const t_commrec*      cr,
 366                                   gmx::ForceWithVirial* forceWithVirial,
 367                                   real*                 energy_q,
 368                                   real*                 energy_lj,
 369                                   real*                 dvdlambda_q,
 370                                   real*                 dvdlambda_lj,
 371                                   float*                pme_cycles)
 372 {
 373     gmx_pme_comm_vir_ene_t cve;
 374
 375     if (cr->dd->pme_receive_vir_ener)
 376     {
 377         if (debug)
 378         {
 379             fprintf(debug, "PP rank %d receiving from PME rank %d: virial and energy\n",
 380                     cr->sim_nodeid, cr->dd->pme_nodeid);
 381         }
 382 #if GMX_MPI
 383         MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
 384 #else
 385         memset(&cve, 0, sizeof(cve));
 386 #endif
 387
 388         forceWithVirial->addVirialContribution(cve.vir_q);
 389         forceWithVirial->addVirialContribution(cve.vir_lj);
 390         *energy_q  = cve.energy_q;
 391         *energy_lj = cve.energy_lj;
 392         *dvdlambda_q += cve.dvdlambda_q;
 393         *dvdlambda_lj += cve.dvdlambda_lj;
 394         *pme_cycles = cve.cycles;
 395
 396         if (cve.stop_cond != gmx_stop_cond_none)
 397         {
 398             gmx_set_stop_condition(cve.stop_cond);
 399         }
 400     }
 401     else
 402     {
 403         *energy_q   = 0;
 404         *energy_lj  = 0;
 405         *pme_cycles = 0;
 406     }
 407 }
 408
 409 /*! \brief Recieve force data from PME ranks */
 410 static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
 411                          void*              recvptr,
 412                          int                n,
 413                          const t_commrec*   cr,
 414                          bool               useGpuPmePpComms,
 415                          bool               receivePmeForceToGpu)
 416 {
 417     if (useGpuPmePpComms)
 418     {
 419         GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
 420         // Receive directly using CUDA memory copy
 421         pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
 422     }
 423     else
 424     {
 425         // Receive data using MPI
 426 #if GMX_MPI
 427         MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
 428                  MPI_STATUS_IGNORE);
 429 #else
 430         GMX_UNUSED_VALUE(cr);
 431 #endif
 432     }
 433 }
 434
 435
 436 void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
 437                        const t_commrec*      cr,
 438                        gmx::ForceWithVirial* forceWithVirial,
 439                        real*                 energy_q,
 440                        real*                 energy_lj,
 441                        real*                 dvdlambda_q,
 442                        real*                 dvdlambda_lj,
 443                        bool                  useGpuPmePpComms,
 444                        bool                  receivePmeForceToGpu,
 445                        float*                pme_cycles)
 446 {
 447     if (c_useDelayedWait)
 448     {
 449         /* Wait for the x request to finish */
 450         gmx_pme_send_coeffs_coords_wait(cr->dd);
 451     }
 452
 453     const int               natoms = dd_numHomeAtoms(*cr->dd);
 454     std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
 455     buffer.resize(natoms);
 456
 457     void* recvptr = reinterpret_cast<void*>(buffer.data());
 458     recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);
 459
 460     int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);
 461
 462     gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
 463
 464     if (!receivePmeForceToGpu)
 465     {
 466         /* Note that we would like to avoid this conditional by putting it
 467          * into the omp pragma instead, but then we still take the full
 468          * omp parallel for overhead (at least with gcc5).
 469          */
 470         if (nt == 1)
 471         {
 472             for (int i = 0; i < natoms; i++)
 473             {
 474                 f[i] += buffer[i];
 475             }
 476         }
 477         else
 478         {
 479 #pragma omp parallel for num_threads(nt) schedule(static)
 480             for (int i = 0; i < natoms; i++)
 481             {
 482                 f[i] += buffer[i];
 483             }
 484         }
 485     }
 486
 487     receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
 488 }