2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
40 * \brief This file contains function definitions necessary for
41 * managing the offload of long-ranged PME work to separate MPI rank,
42 * for computing energies and forces (Coulomb and LJ).
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
57 #include "gromacs/domdec/domdec.h"
58 #include "gromacs/domdec/domdec_struct.h"
59 #include "gromacs/ewald/pme.h"
60 #include "gromacs/ewald/pme_pp_comm_gpu.h"
61 #include "gromacs/gmxlib/network.h"
62 #include "gromacs/math/vec.h"
63 #include "gromacs/mdlib/gmx_omp_nthreads.h"
64 #include "gromacs/mdtypes/commrec.h"
65 #include "gromacs/mdtypes/forceoutput.h"
66 #include "gromacs/mdtypes/forcerec.h"
67 #include "gromacs/mdtypes/interaction_const.h"
68 #include "gromacs/mdtypes/md_enums.h"
69 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
70 #include "gromacs/nbnxm/nbnxm.h"
71 #include "gromacs/timing/wallcycle.h"
72 #include "gromacs/utility/fatalerror.h"
73 #include "gromacs/utility/gmxmpi.h"
74 #include "gromacs/utility/smalloc.h"
76 #include "pme_pp_communication.h"
78 /*! \brief Block to wait for communication to PME ranks to complete
80 * This should be faster with a real non-blocking MPI implementation
82 static constexpr bool c_useDelayedWait
= false;
84 /*! \brief Wait for the pending data send requests to PME ranks to complete */
85 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t
* dd
)
90 MPI_Waitall(dd
->nreq_pme
, dd
->req_pme
, MPI_STATUSES_IGNORE
);
96 /*! \brief Send data to PME ranks */
97 static void gmx_pme_send_coeffs_coords(t_forcerec
* fr
,
100 real gmx_unused
* chargeA
,
101 real gmx_unused
* chargeB
,
102 real gmx_unused
* c6A
,
103 real gmx_unused
* c6B
,
104 real gmx_unused
* sigmaA
,
105 real gmx_unused
* sigmaB
,
107 const rvec gmx_unused
* x
,
113 bool useGpuPmePpComms
,
114 bool reinitGpuPmePpComms
,
115 bool sendCoordinatesFromGpu
,
116 GpuEventSynchronizer
* coordinatesReadyOnDeviceEvent
)
119 gmx_pme_comm_n_box_t
* cnb
;
123 n
= dd_numHomeAtoms(*dd
);
127 fprintf(debug
, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr
->sim_nodeid
, dd
->pme_nodeid
,
128 n
, (flags
& PP_PME_CHARGE
) ? " charges" : "", (flags
& PP_PME_SQRTC6
) ? " sqrtC6" : "",
129 (flags
& PP_PME_SIGMA
) ? " sigma" : "", (flags
& PP_PME_COORD
) ? " coordinates" : "");
132 if (useGpuPmePpComms
)
134 flags
|= PP_PME_GPUCOMMS
;
137 if (c_useDelayedWait
)
139 /* We can not use cnb until pending communication has finished */
140 gmx_pme_send_coeffs_coords_wait(dd
);
143 if (dd
->pme_receive_vir_ener
)
145 /* Peer PP node: communicate all data */
146 if (dd
->cnb
== nullptr)
154 cnb
->maxshift_x
= maxshift_x
;
155 cnb
->maxshift_y
= maxshift_y
;
156 cnb
->lambda_q
= lambda_q
;
157 cnb
->lambda_lj
= lambda_lj
;
159 if (flags
& PP_PME_COORD
)
161 copy_mat(box
, cnb
->box
);
164 MPI_Isend(cnb
, sizeof(*cnb
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
,
165 &dd
->req_pme
[dd
->nreq_pme
++]);
168 else if (flags
& (PP_PME_CHARGE
| PP_PME_SQRTC6
| PP_PME_SIGMA
))
171 /* Communicate only the number of atoms */
172 MPI_Isend(&n
, sizeof(n
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
,
173 &dd
->req_pme
[dd
->nreq_pme
++]);
180 if (flags
& PP_PME_CHARGE
)
182 MPI_Isend(chargeA
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_ChargeA
,
183 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
185 if (flags
& PP_PME_CHARGEB
)
187 MPI_Isend(chargeB
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_ChargeB
,
188 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
190 if (flags
& PP_PME_SQRTC6
)
192 MPI_Isend(c6A
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SQRTC6A
,
193 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
195 if (flags
& PP_PME_SQRTC6B
)
197 MPI_Isend(c6B
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SQRTC6B
,
198 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
200 if (flags
& PP_PME_SIGMA
)
202 MPI_Isend(sigmaA
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SigmaA
,
203 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
205 if (flags
& PP_PME_SIGMAB
)
207 MPI_Isend(sigmaB
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SigmaB
,
208 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
210 if (flags
& PP_PME_COORD
)
212 if (reinitGpuPmePpComms
)
214 fr
->pmePpCommGpu
->reinit(n
);
218 /* MPI_Isend does not accept a const buffer pointer */
219 real
* xRealPtr
= const_cast<real
*>(x
[0]);
220 if (useGpuPmePpComms
&& (fr
!= nullptr))
222 void* sendPtr
= sendCoordinatesFromGpu
223 ? static_cast<void*>(fr
->stateGpu
->getCoordinates())
224 : static_cast<void*>(xRealPtr
);
225 fr
->pmePpCommGpu
->sendCoordinatesToPmeCudaDirect(sendPtr
, n
, sendCoordinatesFromGpu
,
226 coordinatesReadyOnDeviceEvent
);
230 MPI_Isend(xRealPtr
, n
* sizeof(rvec
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_COORD
,
231 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
236 GMX_UNUSED_VALUE(fr
);
237 GMX_UNUSED_VALUE(reinitGpuPmePpComms
);
238 GMX_UNUSED_VALUE(sendCoordinatesFromGpu
);
239 GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent
);
241 if (!c_useDelayedWait
)
243 /* Wait for the data to arrive */
244 /* We can skip this wait as we are sure x and q will not be modified
245 * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
247 gmx_pme_send_coeffs_coords_wait(dd
);
251 void gmx_pme_send_parameters(const t_commrec
* cr
,
252 const interaction_const_t
* ic
,
253 gmx_bool bFreeEnergy_q
,
254 gmx_bool bFreeEnergy_lj
,
264 unsigned int flags
= 0;
266 if (EEL_PME(ic
->eeltype
))
268 flags
|= PP_PME_CHARGE
;
270 if (EVDW_PME(ic
->vdwtype
))
272 flags
|= (PP_PME_SQRTC6
| PP_PME_SIGMA
);
274 if (bFreeEnergy_q
|| bFreeEnergy_lj
)
276 /* Assumes that the B state flags are in the bits just above
277 * the ones for the A state. */
278 flags
|= (flags
<< 1);
281 gmx_pme_send_coeffs_coords(nullptr, cr
, flags
, chargeA
, chargeB
, sqrt_c6A
, sqrt_c6B
, sigmaA
,
282 sigmaB
, nullptr, nullptr, 0, 0, maxshift_x
, maxshift_y
, -1, false,
283 false, false, nullptr);
286 void gmx_pme_send_coordinates(t_forcerec
* fr
,
292 bool computeEnergyAndVirial
,
294 bool useGpuPmePpComms
,
295 bool receiveCoordinateAddressFromPme
,
296 bool sendCoordinatesFromGpu
,
297 GpuEventSynchronizer
* coordinatesReadyOnDeviceEvent
,
298 gmx_wallcycle
* wcycle
)
300 wallcycle_start(wcycle
, ewcPP_PMESENDX
);
302 unsigned int flags
= PP_PME_COORD
;
303 if (computeEnergyAndVirial
)
305 flags
|= PP_PME_ENER_VIR
;
307 gmx_pme_send_coeffs_coords(fr
, cr
, flags
, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
308 box
, x
, lambda_q
, lambda_lj
, 0, 0, step
, useGpuPmePpComms
,
309 receiveCoordinateAddressFromPme
, sendCoordinatesFromGpu
,
310 coordinatesReadyOnDeviceEvent
);
312 wallcycle_stop(wcycle
, ewcPP_PMESENDX
);
315 void gmx_pme_send_finish(const t_commrec
* cr
)
317 unsigned int flags
= PP_PME_FINISH
;
319 gmx_pme_send_coeffs_coords(nullptr, cr
, flags
, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
320 nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
323 void gmx_pme_send_switchgrid(const t_commrec
* cr
, ivec grid_size
, real ewaldcoeff_q
, real ewaldcoeff_lj
)
326 gmx_pme_comm_n_box_t cnb
;
328 /* Only let one PP node signal each PME node */
329 if (cr
->dd
->pme_receive_vir_ener
)
331 cnb
.flags
= PP_PME_SWITCHGRID
;
332 copy_ivec(grid_size
, cnb
.grid_size
);
333 cnb
.ewaldcoeff_q
= ewaldcoeff_q
;
334 cnb
.ewaldcoeff_lj
= ewaldcoeff_lj
;
336 /* We send this, uncommon, message blocking to simplify the code */
337 MPI_Send(&cnb
, sizeof(cnb
), MPI_BYTE
, cr
->dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
);
340 GMX_UNUSED_VALUE(cr
);
341 GMX_UNUSED_VALUE(grid_size
);
342 GMX_UNUSED_VALUE(ewaldcoeff_q
);
343 GMX_UNUSED_VALUE(ewaldcoeff_lj
);
347 void gmx_pme_send_resetcounters(const t_commrec gmx_unused
* cr
, int64_t gmx_unused step
)
350 gmx_pme_comm_n_box_t cnb
;
352 /* Only let one PP node signal each PME node */
353 if (cr
->dd
->pme_receive_vir_ener
)
355 cnb
.flags
= PP_PME_RESETCOUNTERS
;
358 /* We send this, uncommon, message blocking to simplify the code */
359 MPI_Send(&cnb
, sizeof(cnb
), MPI_BYTE
, cr
->dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
);
364 /*! \brief Receive virial and energy from PME rank */
365 static void receive_virial_energy(const t_commrec
* cr
,
366 gmx::ForceWithVirial
* forceWithVirial
,
373 gmx_pme_comm_vir_ene_t cve
;
375 if (cr
->dd
->pme_receive_vir_ener
)
379 fprintf(debug
, "PP rank %d receiving from PME rank %d: virial and energy\n",
380 cr
->sim_nodeid
, cr
->dd
->pme_nodeid
);
383 MPI_Recv(&cve
, sizeof(cve
), MPI_BYTE
, cr
->dd
->pme_nodeid
, 1, cr
->mpi_comm_mysim
, MPI_STATUS_IGNORE
);
385 memset(&cve
, 0, sizeof(cve
));
388 forceWithVirial
->addVirialContribution(cve
.vir_q
);
389 forceWithVirial
->addVirialContribution(cve
.vir_lj
);
390 *energy_q
= cve
.energy_q
;
391 *energy_lj
= cve
.energy_lj
;
392 *dvdlambda_q
+= cve
.dvdlambda_q
;
393 *dvdlambda_lj
+= cve
.dvdlambda_lj
;
394 *pme_cycles
= cve
.cycles
;
396 if (cve
.stop_cond
!= gmx_stop_cond_none
)
398 gmx_set_stop_condition(cve
.stop_cond
);
409 /*! \brief Recieve force data from PME ranks */
410 static void recvFFromPme(gmx::PmePpCommGpu
* pmePpCommGpu
,
414 bool useGpuPmePpComms
,
415 bool receivePmeForceToGpu
)
417 if (useGpuPmePpComms
)
419 GMX_ASSERT(pmePpCommGpu
!= nullptr, "Need valid pmePpCommGpu");
420 // Receive directly using CUDA memory copy
421 pmePpCommGpu
->receiveForceFromPmeCudaDirect(recvptr
, n
, receivePmeForceToGpu
);
425 // Receive data using MPI
427 MPI_Recv(recvptr
, n
* sizeof(rvec
), MPI_BYTE
, cr
->dd
->pme_nodeid
, 0, cr
->mpi_comm_mysim
,
430 GMX_UNUSED_VALUE(cr
);
436 void gmx_pme_receive_f(gmx::PmePpCommGpu
* pmePpCommGpu
,
438 gmx::ForceWithVirial
* forceWithVirial
,
443 bool useGpuPmePpComms
,
444 bool receivePmeForceToGpu
,
447 if (c_useDelayedWait
)
449 /* Wait for the x request to finish */
450 gmx_pme_send_coeffs_coords_wait(cr
->dd
);
453 const int natoms
= dd_numHomeAtoms(*cr
->dd
);
454 std::vector
<gmx::RVec
>& buffer
= cr
->dd
->pmeForceReceiveBuffer
;
455 buffer
.resize(natoms
);
457 void* recvptr
= reinterpret_cast<void*>(buffer
.data());
458 recvFFromPme(pmePpCommGpu
, recvptr
, natoms
, cr
, useGpuPmePpComms
, receivePmeForceToGpu
);
460 int nt
= gmx_omp_nthreads_get_simple_rvec_task(emntDefault
, natoms
);
462 gmx::ArrayRef
<gmx::RVec
> f
= forceWithVirial
->force_
;
464 if (!receivePmeForceToGpu
)
466 /* Note that we would like to avoid this conditional by putting it
467 * into the omp pragma instead, but then we still take the full
468 * omp parallel for overhead (at least with gcc5).
472 for (int i
= 0; i
< natoms
; i
++)
479 #pragma omp parallel for num_threads(nt) schedule(static)
480 for (int i
= 0; i
< natoms
; i
++)
487 receive_virial_energy(cr
, forceWithVirial
, energy_q
, energy_lj
, dvdlambda_q
, dvdlambda_lj
, pme_cycles
);