src/gromacs/ewald/pme_pp.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to a separate MPI rank,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_ewald
 */
#include "gmxpre.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#include "pme_pp_communication.h"
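
/* Orientation note (editorial summary of the code below): this file
 * implements the PP-rank half of the PP-PME protocol. A sending PP rank
 * packs a gmx_pme_comm_n_box_t header (flags, atom count, box, lambdas,
 * step) followed by whatever buffers the flags select (charges, sqrt(C6),
 * sigma, coordinates). Only the PP rank with pme_receive_vir_ener set
 * sends the full header and later receives the virial and energies back;
 * the other PP ranks sharing the same PME rank send just their atom
 * counts and buffers.
 */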

/*! \brief Block to wait for communication to PME ranks to complete
 *
 * This should be faster with a real non-blocking MPI implementation
 */
static constexpr bool c_useDelayedWait = false;
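
/* Note (added for clarity): with c_useDelayedWait == false the sender
 * blocks at the end of gmx_pme_send_coeffs_coords(), so its buffers can
 * be reused immediately; with it set to true the wait is deferred until
 * the buffers are next needed, which could overlap communication with
 * compute on a truly non-blocking MPI implementation.
 */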

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}

/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*            fr,
                                       const t_commrec*       cr,
                                       unsigned int           flags,
                                       real gmx_unused*       chargeA,
                                       real gmx_unused*       chargeB,
                                       real gmx_unused*       c6A,
                                       real gmx_unused*       c6B,
                                       real gmx_unused*       sigmaA,
                                       real gmx_unused*       sigmaB,
                                       const matrix           box,
                                       const rvec gmx_unused* x,
                                       real                   lambda_q,
                                       real                   lambda_lj,
                                       int                    maxshift_x,
                                       int                    maxshift_y,
                                       int64_t                step,
                                       bool                   useGpuPmePpComms,
                                       bool                   reinitGpuPmePpComms,
                                       bool                   sendCoordinatesFromGpu,
                                       GpuEventSynchronizer*  coordinatesReadyOnDeviceEvent)
{
    gmx_domdec_t*         dd;
    gmx_pme_comm_n_box_t* cnb;
    int                   n;

    dd = cr->dd;
    n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr->sim_nodeid, dd->pme_nodeid,
                n, (flags & PP_PME_CHARGE) ? " charges" : "", (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "", (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
    }

    if (c_useDelayedWait)
    {
        /* We can not use cnb until pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n, sizeof(n), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6A,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6B,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
            if (reinitGpuPmePpComms)
            {
                fr->pmePpCommGpu->reinit(n);
            }

            /* MPI_Isend does not accept a const buffer pointer */
            real* xRealPtr = const_cast<real*>(x[0]);
            if (useGpuPmePpComms && (fr != nullptr))
            {
                void* sendPtr = sendCoordinatesFromGpu
                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
                                        : static_cast<void*>(xRealPtr);
                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu,
                                                                 coordinatesReadyOnDeviceEvent);
            }
            else
            {
                MPI_Isend(xRealPtr, n * sizeof(rvec), MPI_BYTE, dd->pme_nodeid, eCommType_COORD,
                          cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    if (!c_useDelayedWait)
    {
        /* Wait for the data to arrive */
        /* We can skip this wait as we are sure x and q will not be modified
         * before the next call to gmx_pme_send_x_q or gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}
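
/* The public send functions below are thin wrappers around
 * gmx_pme_send_coeffs_coords(); each one only selects the flags and
 * buffers appropriate for what it communicates. */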

void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t* ic,
                             gmx_bool                   bFreeEnergy_q,
                             gmx_bool                   bFreeEnergy_lj,
                             real*                      chargeA,
                             real*                      chargeB,
                             real*                      sqrt_c6A,
                             real*                      sqrt_c6B,
                             real*                      sigmaA,
                             real*                      sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(ic->eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(ic->vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
        flags |= (flags << 1);
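        /* E.g., given the flag layout assumed here (see
         * pme_pp_communication.h), (PP_PME_CHARGE << 1) == PP_PME_CHARGEB,
         * and likewise for the SQRTC6 and SIGMA flag pairs, so shifting
         * the A-state bits up by one selects the matching B-state
         * buffers for free-energy runs. */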
    }

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, chargeA, chargeB, sqrt_c6A, sqrt_c6B, sigmaA,
                               sigmaB, nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false,
                               false, false, nullptr);
}

void gmx_pme_send_coordinates(t_forcerec*           fr,
                              const t_commrec*      cr,
                              const matrix          box,
                              const rvec*           x,
                              real                  lambda_q,
                              real                  lambda_lj,
                              gmx_bool              bEnerVir,
                              int64_t               step,
                              bool                  useGpuPmePpComms,
                              bool                  receiveCoordinateAddressFromPme,
                              bool                  sendCoordinatesFromGpu,
                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*        wcycle)
{
    wallcycle_start(wcycle, ewcPP_PMESENDX);

    unsigned int flags = PP_PME_COORD;
    if (bEnerVir)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms,
                               receiveCoordinateAddressFromPme, sendCoordinatesFromGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, ewcPP_PMESENDX);
}

void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}

void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* We send this uncommon message blocking, to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}

void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* We send this uncommon message blocking, to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}

/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug, "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid, cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q  = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != gmx_stop_cond_none)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}

/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive directly using CUDA memory copy
        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
                 MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
#endif
    }
}

void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;

    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }

    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}
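
/* Illustrative sketch of how a PP rank drives this module during one MD
 * step. This is not code from the real MD loop (the actual call sites
 * live in the mdrun sources), and the argument values shown here (no GPU
 * communication, no free energy, names like computeEnergyAndVirial) are
 * placeholder assumptions:
 *
 *   gmx_pme_send_coordinates(fr, cr, box, x, lambda_q, lambda_lj,
 *                            computeEnergyAndVirial, step, false, false,
 *                            false, nullptr, wcycle);
 *   // ... compute short-ranged PP forces while the PME rank works ...
 *   gmx_pme_receive_f(nullptr, cr, &forceWithVirial, &energy_q, &energy_lj,
 *                     &dvdlambda_q, &dvdlambda_lj, false, false, &pmeCycles);
 */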