src/gromacs/ewald/pme_pp.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains function definitions necessary for
 * managing the offload of long-ranged PME work to a separate MPI rank,
 * for computing energies and forces (Coulomb and LJ).
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_ewald
 */
#include "gmxpre.h"

#include "pme_pp.h"

#include "config.h"

#include <cstdio>
#include <cstring>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

#include "pme_pp_communication.h"
/*! \brief Whether to delay blocking on communication to PME ranks until the
 * results are needed, rather than waiting immediately after each send.
 *
 * Delaying the wait should be faster with a real non-blocking MPI implementation.
 */
static constexpr bool c_useDelayedWait = false;

/*! \brief Wait for the pending data send requests to PME ranks to complete */
static void gmx_pme_send_coeffs_coords_wait(gmx_domdec_t* dd)
{
    if (dd->nreq_pme)
    {
#if GMX_MPI
        MPI_Waitall(dd->nreq_pme, dd->req_pme, MPI_STATUSES_IGNORE);
#endif
        dd->nreq_pme = 0;
    }
}
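
/* Illustrative note (an editorial sketch, not part of the original source):
 * with c_useDelayedWait == false (the current setting), every send routine
 * below calls gmx_pme_send_coeffs_coords_wait() before returning, so its
 * buffers may be reused immediately. With it set to true, the wait would be
 * deferred to the start of the next send or to gmx_pme_receive_f(), letting
 * the MPI_Isend calls overlap with other work on a truly non-blocking MPI
 * implementation.
 */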
/*! \brief Send data to PME ranks */
static void gmx_pme_send_coeffs_coords(t_forcerec*            fr,
                                       const t_commrec*       cr,
                                       unsigned int           flags,
                                       real gmx_unused*       chargeA,
                                       real gmx_unused*       chargeB,
                                       real gmx_unused*       c6A,
                                       real gmx_unused*       c6B,
                                       real gmx_unused*       sigmaA,
                                       real gmx_unused*       sigmaB,
                                       const matrix           box,
                                       const rvec gmx_unused* x,
                                       real                   lambda_q,
                                       real                   lambda_lj,
                                       int                    maxshift_x,
                                       int                    maxshift_y,
                                       int64_t                step,
                                       bool                   useGpuPmePpComms,
                                       bool                   reinitGpuPmePpComms,
                                       bool                   sendCoordinatesFromGpu,
                                       GpuEventSynchronizer*  coordinatesReadyOnDeviceEvent)
{
    gmx_domdec_t*         dd;
    gmx_pme_comm_n_box_t* cnb;
    int                   n;

    dd = cr->dd;
    n  = dd_numHomeAtoms(*dd);

    if (debug)
    {
        fprintf(debug, "PP rank %d sending to PME rank %d: %d%s%s%s%s\n", cr->sim_nodeid, dd->pme_nodeid,
                n, (flags & PP_PME_CHARGE) ? " charges" : "", (flags & PP_PME_SQRTC6) ? " sqrtC6" : "",
                (flags & PP_PME_SIGMA) ? " sigma" : "", (flags & PP_PME_COORD) ? " coordinates" : "");
    }

    if (useGpuPmePpComms)
    {
        flags |= PP_PME_GPUCOMMS;
    }

    if (c_useDelayedWait)
    {
        /* We cannot use cnb until pending communication has finished */
        gmx_pme_send_coeffs_coords_wait(dd);
    }

    if (dd->pme_receive_vir_ener)
    {
        /* Peer PP node: communicate all data */
        if (dd->cnb == nullptr)
        {
            snew(dd->cnb, 1);
        }
        cnb = dd->cnb;

        cnb->flags      = flags;
        cnb->natoms     = n;
        cnb->maxshift_x = maxshift_x;
        cnb->maxshift_y = maxshift_y;
        cnb->lambda_q   = lambda_q;
        cnb->lambda_lj  = lambda_lj;
        cnb->step       = step;
        if (flags & PP_PME_COORD)
        {
            copy_mat(box, cnb->box);
        }
#if GMX_MPI
        MPI_Isend(cnb, sizeof(*cnb), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }
    else if (flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
    {
#if GMX_MPI
        /* Communicate only the number of atoms */
        MPI_Isend(&n, sizeof(n), MPI_BYTE, dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim,
                  &dd->req_pme[dd->nreq_pme++]);
#endif
    }

#if GMX_MPI
    if (n > 0)
    {
        if (flags & PP_PME_CHARGE)
        {
            MPI_Isend(chargeA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_CHARGEB)
        {
            MPI_Isend(chargeB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_ChargeB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6)
        {
            MPI_Isend(c6A, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6A,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SQRTC6B)
        {
            MPI_Isend(c6B, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SQRTC6B,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMA)
        {
            MPI_Isend(sigmaA, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaA,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_SIGMAB)
        {
            MPI_Isend(sigmaB, n * sizeof(real), MPI_BYTE, dd->pme_nodeid, eCommType_SigmaB,
                      cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
        }
        if (flags & PP_PME_COORD)
        {
            if (reinitGpuPmePpComms)
            {
                fr->pmePpCommGpu->reinit(n);
            }

            /* MPI_Isend does not accept a const buffer pointer */
            real* xRealPtr = const_cast<real*>(x[0]);
            if (useGpuPmePpComms && (fr != nullptr))
            {
                void* sendPtr = sendCoordinatesFromGpu
                                        ? static_cast<void*>(fr->stateGpu->getCoordinates())
                                        : static_cast<void*>(xRealPtr);
                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu,
                                                                 coordinatesReadyOnDeviceEvent);
            }
            else
            {
                MPI_Isend(xRealPtr, n * sizeof(rvec), MPI_BYTE, dd->pme_nodeid, eCommType_COORD,
                          cr->mpi_comm_mysim, &dd->req_pme[dd->nreq_pme++]);
            }
        }
    }
#else
    GMX_UNUSED_VALUE(fr);
    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif

    if (!c_useDelayedWait)
    {
        /* Wait for the send requests to complete. In principle this wait could
         * be skipped, since x and q will not be modified before the next call
         * to gmx_pme_send_coordinates or gmx_pme_receive_f.
         */
        gmx_pme_send_coeffs_coords_wait(dd);
    }
}
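
/* Usage sketch (editorial illustration, not part of the original source):
 * the public entry points below all funnel into gmx_pme_send_coeffs_coords()
 * with different flag sets:
 *   gmx_pme_send_parameters()  -> PP_PME_CHARGE / PP_PME_SQRTC6 / PP_PME_SIGMA
 *   gmx_pme_send_coordinates() -> PP_PME_COORD (optionally PP_PME_ENER_VIR)
 *   gmx_pme_send_finish()      -> PP_PME_FINISH
 * Only the rarer grid-switch and counter-reset messages bypass it and use a
 * blocking MPI_Send directly.
 */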
void gmx_pme_send_parameters(const t_commrec*           cr,
                             const interaction_const_t* ic,
                             gmx_bool                   bFreeEnergy_q,
                             gmx_bool                   bFreeEnergy_lj,
                             real*                      chargeA,
                             real*                      chargeB,
                             real*                      sqrt_c6A,
                             real*                      sqrt_c6B,
                             real*                      sigmaA,
                             real*                      sigmaB,
                             int                        maxshift_x,
                             int                        maxshift_y)
{
    unsigned int flags = 0;

    if (EEL_PME(ic->eeltype))
    {
        flags |= PP_PME_CHARGE;
    }
    if (EVDW_PME(ic->vdwtype))
    {
        flags |= (PP_PME_SQRTC6 | PP_PME_SIGMA);
    }
    if (bFreeEnergy_q || bFreeEnergy_lj)
    {
        /* Assumes that the B state flags are in the bits just above
         * the ones for the A state. */
        flags |= (flags << 1);
    }
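
    /* A hedged sanity-check sketch (editorial; not in the original source):
     * the shift above relies on the bit layout of the flags defined in
     * pme_pp_communication.h. If those constants are visible here, the
     * assumption could be verified at compile time with, e.g.:
     *   static_assert(PP_PME_CHARGEB == (PP_PME_CHARGE << 1)
     *                         && PP_PME_SQRTC6B == (PP_PME_SQRTC6 << 1)
     *                         && PP_PME_SIGMAB == (PP_PME_SIGMA << 1),
     *                 "B-state flags must sit one bit above their A-state flags");
     */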
    gmx_pme_send_coeffs_coords(nullptr, cr, flags, chargeA, chargeB, sqrt_c6A, sqrt_c6B, sigmaA,
                               sigmaB, nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false,
                               false, false, nullptr);
}
void gmx_pme_send_coordinates(t_forcerec*           fr,
                              const t_commrec*      cr,
                              const matrix          box,
                              const rvec*           x,
                              real                  lambda_q,
                              real                  lambda_lj,
                              bool                  computeEnergyAndVirial,
                              int64_t               step,
                              bool                  useGpuPmePpComms,
                              bool                  receiveCoordinateAddressFromPme,
                              bool                  sendCoordinatesFromGpu,
                              GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                              gmx_wallcycle*        wcycle)
{
    wallcycle_start(wcycle, ewcPP_PMESENDX);

    unsigned int flags = PP_PME_COORD;
    if (computeEnergyAndVirial)
    {
        flags |= PP_PME_ENER_VIR;
    }
    gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms,
                               receiveCoordinateAddressFromPme, sendCoordinatesFromGpu,
                               coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle, ewcPP_PMESENDX);
}
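
/* Illustrative per-step pairing (a sketch; the real call site is the MD loop,
 * which lives outside this file):
 *   gmx_pme_send_coordinates(fr, cr, box, x, lambda_q, lambda_lj,
 *                            computeEnergyAndVirial, step, ...);
 *   ... the PP rank computes its short-ranged forces while the PME rank
 *       works on the long-ranged part ...
 *   gmx_pme_receive_f(pmePpCommGpu, cr, &forceWithVirial, &energy_q, &energy_lj,
 *                     &dvdlambda_q, &dvdlambda_lj, ...);
 */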
void gmx_pme_send_finish(const t_commrec* cr)
{
    unsigned int flags = PP_PME_FINISH;

    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                               nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}
void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_SWITCHGRID;
        copy_ivec(grid_size, cnb.grid_size);
        cnb.ewaldcoeff_q  = ewaldcoeff_q;
        cnb.ewaldcoeff_lj = ewaldcoeff_lj;

        /* This message is uncommon, so we send it blocking to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(grid_size);
    GMX_UNUSED_VALUE(ewaldcoeff_q);
    GMX_UNUSED_VALUE(ewaldcoeff_lj);
#endif
}
void gmx_pme_send_resetcounters(const t_commrec gmx_unused* cr, int64_t gmx_unused step)
{
#if GMX_MPI
    gmx_pme_comm_n_box_t cnb;

    /* Only let one PP node signal each PME node */
    if (cr->dd->pme_receive_vir_ener)
    {
        cnb.flags = PP_PME_RESETCOUNTERS;
        cnb.step  = step;

        /* This message is uncommon, so we send it blocking to simplify the code */
        MPI_Send(&cnb, sizeof(cnb), MPI_BYTE, cr->dd->pme_nodeid, eCommType_CNB, cr->mpi_comm_mysim);
    }
#endif
}
/*! \brief Receive virial and energy from PME rank */
static void receive_virial_energy(const t_commrec*      cr,
                                  gmx::ForceWithVirial* forceWithVirial,
                                  real*                 energy_q,
                                  real*                 energy_lj,
                                  real*                 dvdlambda_q,
                                  real*                 dvdlambda_lj,
                                  float*                pme_cycles)
{
    gmx_pme_comm_vir_ene_t cve;

    if (cr->dd->pme_receive_vir_ener)
    {
        if (debug)
        {
            fprintf(debug, "PP rank %d receiving from PME rank %d: virial and energy\n",
                    cr->sim_nodeid, cr->dd->pme_nodeid);
        }
#if GMX_MPI
        MPI_Recv(&cve, sizeof(cve), MPI_BYTE, cr->dd->pme_nodeid, 1, cr->mpi_comm_mysim, MPI_STATUS_IGNORE);
#else
        memset(&cve, 0, sizeof(cve));
#endif

        forceWithVirial->addVirialContribution(cve.vir_q);
        forceWithVirial->addVirialContribution(cve.vir_lj);
        *energy_q = cve.energy_q;
        *energy_lj = cve.energy_lj;
        *dvdlambda_q += cve.dvdlambda_q;
        *dvdlambda_lj += cve.dvdlambda_lj;
        *pme_cycles = cve.cycles;

        if (cve.stop_cond != gmx_stop_cond_none)
        {
            gmx_set_stop_condition(cve.stop_cond);
        }
    }
    else
    {
        *energy_q   = 0;
        *energy_lj  = 0;
        *pme_cycles = 0;
    }
}
/*! \brief Receive force data from PME ranks */
static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
                         void*              recvptr,
                         int                n,
                         const t_commrec*   cr,
                         bool               useGpuPmePpComms,
                         bool               receivePmeForceToGpu)
{
    if (useGpuPmePpComms)
    {
        GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
        // Receive directly using CUDA memory copy
        pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
    }
    else
    {
        // Receive data using MPI
#if GMX_MPI
        MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
                 MPI_STATUS_IGNORE);
#else
        GMX_UNUSED_VALUE(cr);
#endif
    }
}
void gmx_pme_receive_f(gmx::PmePpCommGpu*    pmePpCommGpu,
                       const t_commrec*      cr,
                       gmx::ForceWithVirial* forceWithVirial,
                       real*                 energy_q,
                       real*                 energy_lj,
                       real*                 dvdlambda_q,
                       real*                 dvdlambda_lj,
                       bool                  useGpuPmePpComms,
                       bool                  receivePmeForceToGpu,
                       float*                pme_cycles)
{
    if (c_useDelayedWait)
    {
        /* Wait for the x request to finish */
        gmx_pme_send_coeffs_coords_wait(cr->dd);
    }

    const int               natoms = dd_numHomeAtoms(*cr->dd);
    std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
    buffer.resize(natoms);

    void* recvptr = reinterpret_cast<void*>(buffer.data());
    recvFFromPme(pmePpCommGpu, recvptr, natoms, cr, useGpuPmePpComms, receivePmeForceToGpu);

    int nt = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, natoms);

    gmx::ArrayRef<gmx::RVec> f = forceWithVirial->force_;
    if (!receivePmeForceToGpu)
    {
        /* Note that we would like to avoid this conditional by putting it
         * into the omp pragma instead, but then we still take the full
         * omp parallel for overhead (at least with gcc5).
         */
        if (nt == 1)
        {
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
        else
        {
#pragma omp parallel for num_threads(nt) schedule(static)
            for (int i = 0; i < natoms; i++)
            {
                f[i] += buffer[i];
            }
        }
    }
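
    /* Editorial sketch of the alternative rejected in the note above: folding
     * the thread-count decision into the pragma itself, e.g.
     *   #pragma omp parallel for num_threads(nt) schedule(static) if (nt > 1)
     * would remove the duplicated loop body, but still pays the full
     * parallel-region overhead (at least with gcc5), hence the explicit
     * nt == 1 fast path.
     */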
    receive_virial_energy(cr, forceWithVirial, energy_q, energy_lj, dvdlambda_q, dvdlambda_lj, pme_cycles);
}