2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
40 * \brief This file contains function definitions necessary for
41 * managing the offload of long-ranged PME work to separate MPI rank,
42 * for computing energies and forces (Coulomb and LJ).
44 * \author Berk Hess <hess@kth.se>
45 * \ingroup module_ewald
55 #include "gromacs/domdec/domdec.h"
56 #include "gromacs/domdec/domdec_struct.h"
57 #include "gromacs/ewald/pme.h"
58 #include "gromacs/ewald/pme_pp_comm_gpu.h"
59 #include "gromacs/gmxlib/network.h"
60 #include "gromacs/math/vec.h"
61 #include "gromacs/mdlib/gmx_omp_nthreads.h"
62 #include "gromacs/mdtypes/commrec.h"
63 #include "gromacs/mdtypes/forceoutput.h"
64 #include "gromacs/mdtypes/forcerec.h"
65 #include "gromacs/mdtypes/interaction_const.h"
66 #include "gromacs/mdtypes/md_enums.h"
67 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
68 #include "gromacs/nbnxm/nbnxm.h"
69 #include "gromacs/timing/wallcycle.h"
70 #include "gromacs/utility/fatalerror.h"
71 #include "gromacs/utility/gmxmpi.h"
72 #include "gromacs/utility/smalloc.h"
74 #include "pme_internal.h"
75 #include "pme_pp_communication.h"
77 /*! \brief Block to wait for communication to PME ranks to complete
79 * This should be faster with a real non-blocking MPI implementation
81 static constexpr bool c_useDelayedWait
= false;
83 /*! \brief Wait for the pending data send requests to PME ranks to complete */
84 static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t
* dd
)
89 MPI_Waitall(dd
->nreq_pme
, dd
->req_pme
, MPI_STATUSES_IGNORE
);
95 /*! \brief Send data to PME ranks */
96 static void gmx_pme_send_coeffs_coords(t_forcerec
* fr
,
99 real gmx_unused
* chargeA
,
100 real gmx_unused
* chargeB
,
101 real gmx_unused
* c6A
,
102 real gmx_unused
* c6B
,
103 real gmx_unused
* sigmaA
,
104 real gmx_unused
* sigmaB
,
106 const rvec gmx_unused
* x
,
112 bool useGpuPmePpComms
,
113 bool reinitGpuPmePpComms
,
114 bool sendCoordinatesFromGpu
,
115 GpuEventSynchronizer
* coordinatesReadyOnDeviceEvent
)
118 gmx_pme_comm_n_box_t
* cnb
;
122 n
= dd_numHomeAtoms(*dd
);
126 fprintf(debug
, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr
->sim_nodeid
, dd
->pme_nodeid
,
127 n
, (flags
& PP_PME_CHARGE
) ? " charges" : "", (flags
& PP_PME_SQRTC6
) ? " sqrtC6" : "",
128 (flags
& PP_PME_SIGMA
) ? " sigma" : "", (flags
& PP_PME_COORD
) ? " coordinates" : "");
131 if (useGpuPmePpComms
)
133 flags
|= PP_PME_GPUCOMMS
;
136 if (c_useDelayedWait
)
138 /* We can not use cnb until pending communication has finished */
139 gmx_pme_send_coeffs_coords_wait(dd
);
142 if (dd
->pme_receive_vir_ener
)
144 /* Peer PP node: communicate all data */
145 if (dd
->cnb
== nullptr)
153 cnb
->maxshift_x
= maxshift_x
;
154 cnb
->maxshift_y
= maxshift_y
;
155 cnb
->lambda_q
= lambda_q
;
156 cnb
->lambda_lj
= lambda_lj
;
158 if (flags
& PP_PME_COORD
)
160 copy_mat(box
, cnb
->box
);
163 MPI_Isend(cnb
, sizeof(*cnb
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
,
164 &dd
->req_pme
[dd
->nreq_pme
++]);
167 else if (flags
& (PP_PME_CHARGE
| PP_PME_SQRTC6
| PP_PME_SIGMA
))
170 /* Communicate only the number of atoms */
171 MPI_Isend(&n
, sizeof(n
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
,
172 &dd
->req_pme
[dd
->nreq_pme
++]);
179 if (flags
& PP_PME_CHARGE
)
181 MPI_Isend(chargeA
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_ChargeA
,
182 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
184 if (flags
& PP_PME_CHARGEB
)
186 MPI_Isend(chargeB
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_ChargeB
,
187 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
189 if (flags
& PP_PME_SQRTC6
)
191 MPI_Isend(c6A
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SQRTC6A
,
192 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
194 if (flags
& PP_PME_SQRTC6B
)
196 MPI_Isend(c6B
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SQRTC6B
,
197 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
199 if (flags
& PP_PME_SIGMA
)
201 MPI_Isend(sigmaA
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SigmaA
,
202 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
204 if (flags
& PP_PME_SIGMAB
)
206 MPI_Isend(sigmaB
, n
* sizeof(real
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_SigmaB
,
207 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
209 if (flags
& PP_PME_COORD
)
211 if (reinitGpuPmePpComms
)
213 fr
->pmePpCommGpu
->reinit(n
);
217 /* MPI_Isend does not accept a const buffer pointer */
218 real
* xRealPtr
= const_cast<real
*>(x
[0]);
219 if (useGpuPmePpComms
&& (fr
!= nullptr))
221 void* sendPtr
= sendCoordinatesFromGpu
222 ? static_cast<void*>(fr
->stateGpu
->getCoordinates())
223 : static_cast<void*>(xRealPtr
);
224 fr
->pmePpCommGpu
->sendCoordinatesToPmeCudaDirect(sendPtr
, n
, sendCoordinatesFromGpu
,
225 coordinatesReadyOnDeviceEvent
);
229 MPI_Isend(xRealPtr
, n
* sizeof(rvec
), MPI_BYTE
, dd
->pme_nodeid
, eCommType_COORD
,
230 cr
->mpi_comm_mysim
, &dd
->req_pme
[dd
->nreq_pme
++]);
235 GMX_UNUSED_VALUE(fr
);
236 GMX_UNUSED_VALUE(reinitGpuPmePpComms
);
237 GMX_UNUSED_VALUE(sendCoordinatesFromGpu
);
238 GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent
);
240 if (!c_useDelayedWait
)
242 /* Wait for the data to arrive */
243 /* We can skip this wait as we are sure x and q will not be modified
244 * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
246 gmx_pme_send_coeffs_coords_wait(dd
);
250 void gmx_pme_send_parameters(const t_commrec
* cr
,
251 const interaction_const_t
* ic
,
252 gmx_bool bFreeEnergy_q
,
253 gmx_bool bFreeEnergy_lj
,
263 unsigned int flags
= 0;
265 if (EEL_PME(ic
->eeltype
))
267 flags
|= PP_PME_CHARGE
;
269 if (EVDW_PME(ic
->vdwtype
))
271 flags
|= (PP_PME_SQRTC6
| PP_PME_SIGMA
);
273 if (bFreeEnergy_q
|| bFreeEnergy_lj
)
275 /* Assumes that the B state flags are in the bits just above
276 * the ones for the A state. */
277 flags
|= (flags
<< 1);
280 gmx_pme_send_coeffs_coords(nullptr, cr
, flags
, chargeA
, chargeB
, sqrt_c6A
, sqrt_c6B
, sigmaA
,
281 sigmaB
, nullptr, nullptr, 0, 0, maxshift_x
, maxshift_y
, -1, false,
282 false, false, nullptr);
285 void gmx_pme_send_coordinates(t_forcerec
* fr
,
293 bool useGpuPmePpComms
,
294 bool receiveCoordinateAddressFromPme
,
295 bool sendCoordinatesFromGpu
,
296 GpuEventSynchronizer
* coordinatesReadyOnDeviceEvent
,
297 gmx_wallcycle
* wcycle
)
299 wallcycle_start(wcycle
, ewcPP_PMESENDX
);
301 unsigned int flags
= PP_PME_COORD
;
304 flags
|= PP_PME_ENER_VIR
;
306 gmx_pme_send_coeffs_coords(fr
, cr
, flags
, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
307 box
, x
, lambda_q
, lambda_lj
, 0, 0, step
, useGpuPmePpComms
,
308 receiveCoordinateAddressFromPme
, sendCoordinatesFromGpu
,
309 coordinatesReadyOnDeviceEvent
);
311 wallcycle_stop(wcycle
, ewcPP_PMESENDX
);
314 void gmx_pme_send_finish(const t_commrec
* cr
)
316 unsigned int flags
= PP_PME_FINISH
;
318 gmx_pme_send_coeffs_coords(nullptr, cr
, flags
, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
319 nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
322 void gmx_pme_send_switchgrid(const t_commrec
* cr
, ivec grid_size
, real ewaldcoeff_q
, real ewaldcoeff_lj
)
325 gmx_pme_comm_n_box_t cnb
;
327 /* Only let one PP node signal each PME node */
328 if (cr
->dd
->pme_receive_vir_ener
)
330 cnb
.flags
= PP_PME_SWITCHGRID
;
331 copy_ivec(grid_size
, cnb
.grid_size
);
332 cnb
.ewaldcoeff_q
= ewaldcoeff_q
;
333 cnb
.ewaldcoeff_lj
= ewaldcoeff_lj
;
335 /* We send this, uncommon, message blocking to simplify the code */
336 MPI_Send(&cnb
, sizeof(cnb
), MPI_BYTE
, cr
->dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
);
339 GMX_UNUSED_VALUE(cr
);
340 GMX_UNUSED_VALUE(grid_size
);
341 GMX_UNUSED_VALUE(ewaldcoeff_q
);
342 GMX_UNUSED_VALUE(ewaldcoeff_lj
);
346 void gmx_pme_send_resetcounters(const t_commrec gmx_unused
* cr
, int64_t gmx_unused step
)
349 gmx_pme_comm_n_box_t cnb
;
351 /* Only let one PP node signal each PME node */
352 if (cr
->dd
->pme_receive_vir_ener
)
354 cnb
.flags
= PP_PME_RESETCOUNTERS
;
357 /* We send this, uncommon, message blocking to simplify the code */
358 MPI_Send(&cnb
, sizeof(cnb
), MPI_BYTE
, cr
->dd
->pme_nodeid
, eCommType_CNB
, cr
->mpi_comm_mysim
);
363 /*! \brief Receive virial and energy from PME rank */
364 static void receive_virial_energy(const t_commrec
* cr
,
365 gmx::ForceWithVirial
* forceWithVirial
,
372 gmx_pme_comm_vir_ene_t cve
;
374 if (cr
->dd
->pme_receive_vir_ener
)
378 fprintf(debug
, "PP rank %d receiving from PME rank %d: virial and energy\n",
379 cr
->sim_nodeid
, cr
->dd
->pme_nodeid
);
382 MPI_Recv(&cve
, sizeof(cve
), MPI_BYTE
, cr
->dd
->pme_nodeid
, 1, cr
->mpi_comm_mysim
, MPI_STATUS_IGNORE
);
384 memset(&cve
, 0, sizeof(cve
));
387 forceWithVirial
->addVirialContribution(cve
.vir_q
);
388 forceWithVirial
->addVirialContribution(cve
.vir_lj
);
389 *energy_q
= cve
.energy_q
;
390 *energy_lj
= cve
.energy_lj
;
391 *dvdlambda_q
+= cve
.dvdlambda_q
;
392 *dvdlambda_lj
+= cve
.dvdlambda_lj
;
393 *pme_cycles
= cve
.cycles
;
395 if (cve
.stop_cond
!= gmx_stop_cond_none
)
397 gmx_set_stop_condition(cve
.stop_cond
);
408 /*! \brief Recieve force data from PME ranks */
409 static void recvFFromPme(gmx::PmePpCommGpu
* pmePpCommGpu
,
413 bool useGpuPmePpComms
,
414 bool receivePmeForceToGpu
)
416 if (useGpuPmePpComms
)
418 GMX_ASSERT(pmePpCommGpu
!= nullptr, "Need valid pmePpCommGpu");
419 // Receive directly using CUDA memory copy
420 pmePpCommGpu
->receiveForceFromPmeCudaDirect(recvptr
, n
, receivePmeForceToGpu
);
424 // Receive data using MPI
426 MPI_Recv(recvptr
, n
* sizeof(rvec
), MPI_BYTE
, cr
->dd
->pme_nodeid
, 0, cr
->mpi_comm_mysim
,
429 GMX_UNUSED_VALUE(cr
);
435 void gmx_pme_receive_f(gmx::PmePpCommGpu
* pmePpCommGpu
,
437 gmx::ForceWithVirial
* forceWithVirial
,
442 bool useGpuPmePpComms
,
443 bool receivePmeForceToGpu
,
446 if (c_useDelayedWait
)
448 /* Wait for the x request to finish */
449 gmx_pme_send_coeffs_coords_wait(cr
->dd
);
452 const int natoms
= dd_numHomeAtoms(*cr
->dd
);
453 std::vector
<gmx::RVec
>& buffer
= cr
->dd
->pmeForceReceiveBuffer
;
454 buffer
.resize(natoms
);
456 void* recvptr
= reinterpret_cast<void*>(buffer
.data());
457 recvFFromPme(pmePpCommGpu
, recvptr
, natoms
, cr
, useGpuPmePpComms
, receivePmeForceToGpu
);
459 int nt
= gmx_omp_nthreads_get_simple_rvec_task(emntDefault
, natoms
);
461 gmx::ArrayRef
<gmx::RVec
> f
= forceWithVirial
->force_
;
463 if (!receivePmeForceToGpu
)
465 /* Note that we would like to avoid this conditional by putting it
466 * into the omp pragma instead, but then we still take the full
467 * omp parallel for overhead (at least with gcc5).
471 for (int i
= 0; i
< natoms
; i
++)
478 #pragma omp parallel for num_threads(nt) schedule(static)
479 for (int i
= 0; i
< natoms
; i
++)
486 receive_virial_energy(cr
, forceWithVirial
, energy_q
, energy_lj
, dvdlambda_q
, dvdlambda_lj
, pme_cycles
);