src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51 #include <memory>
  52
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/options.h"
  59 #include "gromacs/domdec/partition.h"
  60 #include "gromacs/gmxlib/network.h"
  61 #include "gromacs/gmxlib/nrnb.h"
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/hardware/hw_info.h"
  64 #include "gromacs/listed_forces/manage_threading.h"
  65 #include "gromacs/math/vec.h"
  66 #include "gromacs/math/vectypes.h"
  67 #include "gromacs/mdlib/calc_verletbuf.h"
  68 #include "gromacs/mdlib/constr.h"
  69 #include "gromacs/mdlib/constraintrange.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/forceoutput.h"
  74 #include "gromacs/mdtypes/inputrec.h"
  75 #include "gromacs/mdtypes/mdrunoptions.h"
  76 #include "gromacs/mdtypes/state.h"
  77 #include "gromacs/pbcutil/ishift.h"
  78 #include "gromacs/pbcutil/pbc.h"
  79 #include "gromacs/pulling/pull.h"
  80 #include "gromacs/timing/wallcycle.h"
  81 #include "gromacs/topology/block.h"
  82 #include "gromacs/topology/idef.h"
  83 #include "gromacs/topology/ifunc.h"
  84 #include "gromacs/topology/mtop_lookup.h"
  85 #include "gromacs/topology/mtop_util.h"
  86 #include "gromacs/topology/topology.h"
  87 #include "gromacs/utility/basedefinitions.h"
  88 #include "gromacs/utility/basenetwork.h"
  89 #include "gromacs/utility/cstringutil.h"
  90 #include "gromacs/utility/exceptions.h"
  91 #include "gromacs/utility/fatalerror.h"
  92 #include "gromacs/utility/gmxmpi.h"
  93 #include "gromacs/utility/logger.h"
  94 #include "gromacs/utility/real.h"
  95 #include "gromacs/utility/smalloc.h"
  96 #include "gromacs/utility/strconvert.h"
  97 #include "gromacs/utility/stringstream.h"
  98 #include "gromacs/utility/stringutil.h"
  99 #include "gromacs/utility/textwriter.h"
 100
 101 #include "atomdistribution.h"
 102 #include "box.h"
 103 #include "cellsizes.h"
 104 #include "distribute.h"
 105 #include "domdec_constraints.h"
 106 #include "domdec_internal.h"
 107 #include "domdec_vsite.h"
 108 #include "redistribute.h"
 109 #include "utility.h"
 110
 111 // TODO remove this when moving domdec into gmx namespace
 112 using gmx::DdRankOrder;
 113 using gmx::DlbOption;
 114 using gmx::DomdecOptions;
 115
 116 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 117
 118 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 119 #define DD_CGIBS 2
 120
 121 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 122 #define DD_FLAG_NRCG  65535
 123 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 124 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 125
 126 /* The DD zone order */
 127 static const ivec dd_zo[DD_MAXZONE] =
 128 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 129
 130 /* The non-bonded zone-pair setup for domain decomposition
 131  * The first number is the i-zone, the second number the first j-zone seen by
 132  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 133  * As is, this is for 3D decomposition, where there are 4 i-zones.
 134  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 135  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 136  */
 137 static const int
 138     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 139                                                  {1, 3, 6},
 140                                                  {2, 5, 6},
 141                                                  {3, 5, 7}};
 142
 143
 144 /*
 145    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 146
 147    static void index2xyz(ivec nc,int ind,ivec xyz)
 148    {
 149    xyz[XX] = ind % nc[XX];
 150    xyz[YY] = (ind / nc[XX]) % nc[YY];
 151    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 152    }
 153  */
 154
 155 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 156 {
 157     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 158     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 159     xyz[ZZ] = ind % nc[ZZ];
 160 }
 161
 162 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 163 {
 164     int ddindex;
 165     int ddnodeid = -1;
 166
 167     ddindex = dd_index(dd->nc, c);
 168     if (dd->comm->bCartesianPP_PME)
 169     {
 170         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 171     }
 172     else if (dd->comm->bCartesianPP)
 173     {
 174 #if GMX_MPI
 175         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 176 #endif
 177     }
 178     else
 179     {
 180         ddnodeid = ddindex;
 181     }
 182
 183     return ddnodeid;
 184 }
 185
 186 int ddglatnr(const gmx_domdec_t *dd, int i)
 187 {
 188     int atnr;
 189
 190     if (dd == nullptr)
 191     {
 192         atnr = i + 1;
 193     }
 194     else
 195     {
 196         if (i >= dd->comm->atomRanges.numAtomsTotal())
 197         {
 198             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 199         }
 200         atnr = dd->globalAtomIndices[i] + 1;
 201     }
 202
 203     return atnr;
 204 }
 205
 206 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 207 {
 208     return &dd->comm->cgs_gl;
 209 }
 210
 211 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t &dd)
 212 {
 213     GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 214     return dd.comm->updateGroupingPerMoleculetype;
 215 }
 216
 217 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 218 {
 219     int i;
 220
 221     if (state->ddp_count != dd->ddp_count)
 222     {
 223         gmx_incons("The MD state does not match the domain decomposition state");
 224     }
 225
 226     state->cg_gl.resize(dd->ncg_home);
 227     for (i = 0; i < dd->ncg_home; i++)
 228     {
 229         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 230     }
 231
 232     state->ddp_count_cg_gl = dd->ddp_count;
 233 }
 234
 235 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 236 {
 237     return &dd->comm->zones;
 238 }
 239
 240 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 241                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 242 {
 243     gmx_domdec_zones_t *zones;
 244     int                 izone, d, dim;
 245
 246     zones = &dd->comm->zones;
 247
 248     izone = 0;
 249     while (icg >= zones->izone[izone].cg1)
 250     {
 251         izone++;
 252     }
 253
 254     if (izone == 0)
 255     {
 256         *jcg0 = icg;
 257     }
 258     else if (izone < zones->nizone)
 259     {
 260         *jcg0 = zones->izone[izone].jcg0;
 261     }
 262     else
 263     {
 264         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 265                   icg, izone, zones->nizone);
 266     }
 267
 268     *jcg1 = zones->izone[izone].jcg1;
 269
 270     for (d = 0; d < dd->ndim; d++)
 271     {
 272         dim         = dd->dim[d];
 273         shift0[dim] = zones->izone[izone].shift0[dim];
 274         shift1[dim] = zones->izone[izone].shift1[dim];
 275         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 276         {
 277             /* A conservative approach, this can be optimized */
 278             shift0[dim] -= 1;
 279             shift1[dim] += 1;
 280         }
 281     }
 282 }
 283
 284 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 285 {
 286     return dd.comm->atomRanges.numHomeAtoms();
 287 }
 288
 289 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 290 {
 291     /* We currently set mdatoms entries for all atoms:
 292      * local + non-local + communicated for vsite + constraints
 293      */
 294
 295     return dd->comm->atomRanges.numAtomsTotal();
 296 }
 297
 298 int dd_natoms_vsite(const gmx_domdec_t *dd)
 299 {
 300     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 301 }
 302
 303 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 304 {
 305     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 306     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 307 }
 308
 309 void dd_move_x(gmx_domdec_t             *dd,
 310                const matrix              box,
 311                gmx::ArrayRef<gmx::RVec>  x,
 312                gmx_wallcycle            *wcycle)
 313 {
 314     wallcycle_start(wcycle, ewcMOVEX);
 315
 316     int                    nzone, nat_tot;
 317     gmx_domdec_comm_t     *comm;
 318     gmx_domdec_comm_dim_t *cd;
 319     rvec                   shift = {0, 0, 0};
 320     gmx_bool               bPBC, bScrew;
 321
 322     comm = dd->comm;
 323
 324     nzone   = 1;
 325     nat_tot = comm->atomRanges.numHomeAtoms();
 326     for (int d = 0; d < dd->ndim; d++)
 327     {
 328         bPBC   = (dd->ci[dd->dim[d]] == 0);
 329         bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 330         if (bPBC)
 331         {
 332             copy_rvec(box[dd->dim[d]], shift);
 333         }
 334         cd = &comm->cd[d];
 335         for (const gmx_domdec_ind_t &ind : cd->ind)
 336         {
 337             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 338             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 339             int                        n          = 0;
 340             if (!bPBC)
 341             {
 342                 for (int j : ind.index)
 343                 {
 344                     sendBuffer[n] = x[j];
 345                     n++;
 346                 }
 347             }
 348             else if (!bScrew)
 349             {
 350                 for (int j : ind.index)
 351                 {
 352                     /* We need to shift the coordinates */
 353                     for (int d = 0; d < DIM; d++)
 354                     {
 355                         sendBuffer[n][d] = x[j][d] + shift[d];
 356                     }
 357                     n++;
 358                 }
 359             }
 360             else
 361             {
 362                 for (int j : ind.index)
 363                 {
 364                     /* Shift x */
 365                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 366                     /* Rotate y and z.
 367                      * This operation requires a special shift force
 368                      * treatment, which is performed in calc_vir.
 369                      */
 370                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 371                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 372                     n++;
 373                 }
 374             }
 375
 376             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 377
 378             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 379             if (cd->receiveInPlace)
 380             {
 381                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 382             }
 383             else
 384             {
 385                 receiveBuffer = receiveBufferAccess.buffer;
 386             }
 387             /* Send and receive the coordinates */
 388             ddSendrecv(dd, d, dddirBackward,
 389                        sendBuffer, receiveBuffer);
 390
 391             if (!cd->receiveInPlace)
 392             {
 393                 int j = 0;
 394                 for (int zone = 0; zone < nzone; zone++)
 395                 {
 396                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 397                     {
 398                         x[i] = receiveBuffer[j++];
 399                     }
 400                 }
 401             }
 402             nat_tot += ind.nrecv[nzone+1];
 403         }
 404         nzone += nzone;
 405     }
 406
 407     wallcycle_stop(wcycle, ewcMOVEX);
 408 }
 409
 410 void dd_move_f(gmx_domdec_t              *dd,
 411                gmx::ForceWithShiftForces *forceWithShiftForces,
 412                gmx_wallcycle             *wcycle)
 413 {
 414     wallcycle_start(wcycle, ewcMOVEF);
 415
 416     gmx::ArrayRef<gmx::RVec> f       = forceWithShiftForces->force();
 417     gmx::ArrayRef<gmx::RVec> fshift  = forceWithShiftForces->shiftForces();
 418
 419     gmx_domdec_comm_t       &comm    = *dd->comm;
 420     int                      nzone   = comm.zones.n/2;
 421     int                      nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
 422     for (int d = dd->ndim-1; d >= 0; d--)
 423     {
 424         /* Only forces in domains near the PBC boundaries need to
 425            consider PBC in the treatment of fshift */
 426         const bool shiftForcesNeedPbc = (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
 427         const bool applyScrewPbc      = (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 428         /* Determine which shift vector we need */
 429         ivec       vis   = { 0, 0, 0 };
 430         vis[dd->dim[d]]  = 1;
 431         const int  is    = IVEC2IS(vis);
 432
 433         /* Loop over the pulses */
 434         const gmx_domdec_comm_dim_t &cd = comm.cd[d];
 435         for (int p = cd.numPulses() - 1; p >= 0; p--)
 436         {
 437             const gmx_domdec_ind_t    &ind  = cd.ind[p];
 438             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
 439             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 440
 441             nat_tot                                 -= ind.nrecv[nzone+1];
 442
 443             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 444
 445             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 446             if (cd.receiveInPlace)
 447             {
 448                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 449             }
 450             else
 451             {
 452                 sendBuffer = sendBufferAccess.buffer;
 453                 int j = 0;
 454                 for (int zone = 0; zone < nzone; zone++)
 455                 {
 456                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 457                     {
 458                         sendBuffer[j++] = f[i];
 459                     }
 460                 }
 461             }
 462             /* Communicate the forces */
 463             ddSendrecv(dd, d, dddirForward,
 464                        sendBuffer, receiveBuffer);
 465             /* Add the received forces */
 466             int n = 0;
 467             if (!shiftForcesNeedPbc)
 468             {
 469                 for (int j : ind.index)
 470                 {
 471                     for (int d = 0; d < DIM; d++)
 472                     {
 473                         f[j][d] += receiveBuffer[n][d];
 474                     }
 475                     n++;
 476                 }
 477             }
 478             else if (!applyScrewPbc)
 479             {
 480                 for (int j : ind.index)
 481                 {
 482                     for (int d = 0; d < DIM; d++)
 483                     {
 484                         f[j][d] += receiveBuffer[n][d];
 485                     }
 486                     /* Add this force to the shift force */
 487                     for (int d = 0; d < DIM; d++)
 488                     {
 489                         fshift[is][d] += receiveBuffer[n][d];
 490                     }
 491                     n++;
 492                 }
 493             }
 494             else
 495             {
 496                 for (int j : ind.index)
 497                 {
 498                     /* Rotate the force */
 499                     f[j][XX] += receiveBuffer[n][XX];
 500                     f[j][YY] -= receiveBuffer[n][YY];
 501                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 502                     if (shiftForcesNeedPbc)
 503                     {
 504                         /* Add this force to the shift force */
 505                         for (int d = 0; d < DIM; d++)
 506                         {
 507                             fshift[is][d] += receiveBuffer[n][d];
 508                         }
 509                     }
 510                     n++;
 511                 }
 512             }
 513         }
 514         nzone /= 2;
 515     }
 516     wallcycle_stop(wcycle, ewcMOVEF);
 517 }
 518
 519 /* Convenience function for extracting a real buffer from an rvec buffer
 520  *
 521  * To reduce the number of temporary communication buffers and avoid
 522  * cache polution, we reuse gmx::RVec buffers for storing reals.
 523  * This functions return a real buffer reference with the same number
 524  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 525  */
 526 static gmx::ArrayRef<real>
 527 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 528 {
 529     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 530                                   arrayRef.size());
 531 }
 532
 533 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 534 {
 535     int                    nzone, nat_tot;
 536     gmx_domdec_comm_t     *comm;
 537     gmx_domdec_comm_dim_t *cd;
 538
 539     comm = dd->comm;
 540
 541     nzone   = 1;
 542     nat_tot = comm->atomRanges.numHomeAtoms();
 543     for (int d = 0; d < dd->ndim; d++)
 544     {
 545         cd = &comm->cd[d];
 546         for (const gmx_domdec_ind_t &ind : cd->ind)
 547         {
 548             /* Note: We provision for RVec instead of real, so a factor of 3
 549              * more than needed. The buffer actually already has this size
 550              * and we pass a plain pointer below, so this does not matter.
 551              */
 552             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 553             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 554             int                       n          = 0;
 555             for (int j : ind.index)
 556             {
 557                 sendBuffer[n++] = v[j];
 558             }
 559
 560             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 561
 562             gmx::ArrayRef<real>       receiveBuffer;
 563             if (cd->receiveInPlace)
 564             {
 565                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 566             }
 567             else
 568             {
 569                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 570             }
 571             /* Send and receive the data */
 572             ddSendrecv(dd, d, dddirBackward,
 573                        sendBuffer, receiveBuffer);
 574             if (!cd->receiveInPlace)
 575             {
 576                 int j = 0;
 577                 for (int zone = 0; zone < nzone; zone++)
 578                 {
 579                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 580                     {
 581                         v[i] = receiveBuffer[j++];
 582                     }
 583                 }
 584             }
 585             nat_tot += ind.nrecv[nzone+1];
 586         }
 587         nzone += nzone;
 588     }
 589 }
 590
 591 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 592 {
 593     int                    nzone, nat_tot;
 594     gmx_domdec_comm_t     *comm;
 595     gmx_domdec_comm_dim_t *cd;
 596
 597     comm = dd->comm;
 598
 599     nzone   = comm->zones.n/2;
 600     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 601     for (int d = dd->ndim-1; d >= 0; d--)
 602     {
 603         cd = &comm->cd[d];
 604         for (int p = cd->numPulses() - 1; p >= 0; p--)
 605         {
 606             const gmx_domdec_ind_t &ind = cd->ind[p];
 607
 608             /* Note: We provision for RVec instead of real, so a factor of 3
 609              * more than needed. The buffer actually already has this size
 610              * and we typecast, so this works as intended.
 611              */
 612             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 613             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 614             nat_tot -= ind.nrecv[nzone + 1];
 615
 616             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 617
 618             gmx::ArrayRef<real>       sendBuffer;
 619             if (cd->receiveInPlace)
 620             {
 621                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 622             }
 623             else
 624             {
 625                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 626                 int j = 0;
 627                 for (int zone = 0; zone < nzone; zone++)
 628                 {
 629                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 630                     {
 631                         sendBuffer[j++] = v[i];
 632                     }
 633                 }
 634             }
 635             /* Communicate the forces */
 636             ddSendrecv(dd, d, dddirForward,
 637                        sendBuffer, receiveBuffer);
 638             /* Add the received forces */
 639             int n = 0;
 640             for (int j : ind.index)
 641             {
 642                 v[j] += receiveBuffer[n];
 643                 n++;
 644             }
 645         }
 646         nzone /= 2;
 647     }
 648 }
 649
 650 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 651 {
 652     gmx_domdec_comm_t *comm;
 653     int                di;
 654     real               r;
 655
 656     comm = dd->comm;
 657
 658     r = -1;
 659     if (comm->haveInterDomainMultiBodyBondeds)
 660     {
 661         if (comm->cutoff_mbody > 0)
 662         {
 663             r = comm->cutoff_mbody;
 664         }
 665         else
 666         {
 667             /* cutoff_mbody=0 means we do not have DLB */
 668             r = comm->cellsize_min[dd->dim[0]];
 669             for (di = 1; di < dd->ndim; di++)
 670             {
 671                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 672             }
 673             if (comm->bBondComm)
 674             {
 675                 r = std::max(r, comm->cutoff_mbody);
 676             }
 677             else
 678             {
 679                 r = std::min(r, comm->cutoff);
 680             }
 681         }
 682     }
 683
 684     return r;
 685 }
 686
 687 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 688 {
 689     real r_mb;
 690
 691     r_mb = dd_cutoff_multibody(dd);
 692
 693     return std::max(dd->comm->cutoff, r_mb);
 694 }
 695
 696
 697 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 698                                    ivec coord_pme)
 699 {
 700     int nc, ntot;
 701
 702     nc   = dd->nc[dd->comm->cartpmedim];
 703     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 704     copy_ivec(coord, coord_pme);
 705     coord_pme[dd->comm->cartpmedim] =
 706         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 707 }
 708
 709 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 710 {
 711     int npp, npme;
 712
 713     npp  = dd->nnodes;
 714     npme = dd->comm->npmenodes;
 715
 716     /* Here we assign a PME node to communicate with this DD node
 717      * by assuming that the major index of both is x.
 718      * We add cr->npmenodes/2 to obtain an even distribution.
 719      */
 720     return (ddindex*npme + npme/2)/npp;
 721 }
 722
 723 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 724 {
 725     int *pme_rank;
 726     int  n, i, p0, p1;
 727
 728     snew(pme_rank, dd->comm->npmenodes);
 729     n = 0;
 730     for (i = 0; i < dd->nnodes; i++)
 731     {
 732         p0 = ddindex2pmeindex(dd, i);
 733         p1 = ddindex2pmeindex(dd, i+1);
 734         if (i+1 == dd->nnodes || p1 > p0)
 735         {
 736             if (debug)
 737             {
 738                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 739             }
 740             pme_rank[n] = i + 1 + n;
 741             n++;
 742         }
 743     }
 744
 745     return pme_rank;
 746 }
 747
 748 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 749 {
 750     gmx_domdec_t *dd;
 751     ivec          coords;
 752     int           slab;
 753
 754     dd = cr->dd;
 755     /*
 756        if (dd->comm->bCartesian) {
 757        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 758        dd_coords2pmecoords(dd,coords,coords_pme);
 759        copy_ivec(dd->ntot,nc);
 760        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 761        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 762
 763        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 764        } else {
 765        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 766        }
 767      */
 768     coords[XX] = x;
 769     coords[YY] = y;
 770     coords[ZZ] = z;
 771     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 772
 773     return slab;
 774 }
 775
 776 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 777 {
 778     gmx_domdec_comm_t *comm;
 779     ivec               coords;
 780     int                ddindex, nodeid = -1;
 781
 782     comm = cr->dd->comm;
 783
 784     coords[XX] = x;
 785     coords[YY] = y;
 786     coords[ZZ] = z;
 787     if (comm->bCartesianPP_PME)
 788     {
 789 #if GMX_MPI
 790         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 791 #endif
 792     }
 793     else
 794     {
 795         ddindex = dd_index(cr->dd->nc, coords);
 796         if (comm->bCartesianPP)
 797         {
 798             nodeid = comm->ddindex2simnodeid[ddindex];
 799         }
 800         else
 801         {
 802             if (comm->pmenodes)
 803             {
 804                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 805             }
 806             else
 807             {
 808                 nodeid = ddindex;
 809             }
 810         }
 811     }
 812
 813     return nodeid;
 814 }
 815
 816 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 817                               const t_commrec gmx_unused *cr,
 818                               int                         sim_nodeid)
 819 {
 820     int pmenode = -1;
 821
 822     const gmx_domdec_comm_t *comm = dd->comm;
 823
 824     /* This assumes a uniform x domain decomposition grid cell size */
 825     if (comm->bCartesianPP_PME)
 826     {
 827 #if GMX_MPI
 828         ivec coord, coord_pme;
 829         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 830         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 831         {
 832             /* This is a PP node */
 833             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 834             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 835         }
 836 #endif
 837     }
 838     else if (comm->bCartesianPP)
 839     {
 840         if (sim_nodeid < dd->nnodes)
 841         {
 842             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 843         }
 844     }
 845     else
 846     {
 847         /* This assumes DD cells with identical x coordinates
 848          * are numbered sequentially.
 849          */
 850         if (dd->comm->pmenodes == nullptr)
 851         {
 852             if (sim_nodeid < dd->nnodes)
 853             {
 854                 /* The DD index equals the nodeid */
 855                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 856             }
 857         }
 858         else
 859         {
 860             int i = 0;
 861             while (sim_nodeid > dd->comm->pmenodes[i])
 862             {
 863                 i++;
 864             }
 865             if (sim_nodeid < dd->comm->pmenodes[i])
 866             {
 867                 pmenode = dd->comm->pmenodes[i];
 868             }
 869         }
 870     }
 871
 872     return pmenode;
 873 }
 874
 875 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 876 {
 877     if (dd != nullptr)
 878     {
 879         return {
 880                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 881         };
 882     }
 883     else
 884     {
 885         return {
 886                    1, 1
 887         };
 888     }
 889 }
 890
 891 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 892 {
 893     gmx_domdec_t *dd;
 894     int           x, y, z;
 895     ivec          coord, coord_pme;
 896
 897     dd = cr->dd;
 898
 899     std::vector<int> ddranks;
 900     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 901
 902     for (x = 0; x < dd->nc[XX]; x++)
 903     {
 904         for (y = 0; y < dd->nc[YY]; y++)
 905         {
 906             for (z = 0; z < dd->nc[ZZ]; z++)
 907             {
 908                 if (dd->comm->bCartesianPP_PME)
 909                 {
 910                     coord[XX] = x;
 911                     coord[YY] = y;
 912                     coord[ZZ] = z;
 913                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 914                     if (dd->ci[XX] == coord_pme[XX] &&
 915                         dd->ci[YY] == coord_pme[YY] &&
 916                         dd->ci[ZZ] == coord_pme[ZZ])
 917                     {
 918                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 919                     }
 920                 }
 921                 else
 922                 {
 923                     /* The slab corresponds to the nodeid in the PME group */
 924                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 925                     {
 926                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 927                     }
 928                 }
 929             }
 930         }
 931     }
 932     return ddranks;
 933 }
 934
 935 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 936 {
 937     gmx_bool bReceive = TRUE;
 938
 939     if (cr->npmenodes < dd->nnodes)
 940     {
 941         gmx_domdec_comm_t *comm = dd->comm;
 942         if (comm->bCartesianPP_PME)
 943         {
 944 #if GMX_MPI
 945             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 946             ivec coords;
 947             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 948             coords[comm->cartpmedim]++;
 949             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 950             {
 951                 int rank;
 952                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 953                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 954                 {
 955                     /* This is not the last PP node for pmenode */
 956                     bReceive = FALSE;
 957                 }
 958             }
 959 #else
 960             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 961 #endif
 962         }
 963         else
 964         {
 965             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 966             if (cr->sim_nodeid+1 < cr->nnodes &&
 967                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 968             {
 969                 /* This is not the last PP node for pmenode */
 970                 bReceive = FALSE;
 971             }
 972         }
 973     }
 974
 975     return bReceive;
 976 }
 977
 978 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 979 {
 980     gmx_domdec_comm_t *comm;
 981     int                i;
 982
 983     comm = dd->comm;
 984
 985     snew(*dim_f, dd->nc[dim]+1);
 986     (*dim_f)[0] = 0;
 987     for (i = 1; i < dd->nc[dim]; i++)
 988     {
 989         if (comm->slb_frac[dim])
 990         {
 991             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 992         }
 993         else
 994         {
 995             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
 996         }
 997     }
 998     (*dim_f)[dd->nc[dim]] = 1;
 999 }
1000
1001 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1002 {
1003     int  pmeindex, slab, nso, i;
1004     ivec xyz;
1005
1006     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1007     {
1008         ddpme->dim = YY;
1009     }
1010     else
1011     {
1012         ddpme->dim = dimind;
1013     }
1014     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1015
1016     ddpme->nslab = (ddpme->dim == 0 ?
1017                     dd->comm->npmenodes_x :
1018                     dd->comm->npmenodes_y);
1019
1020     if (ddpme->nslab <= 1)
1021     {
1022         return;
1023     }
1024
1025     nso = dd->comm->npmenodes/ddpme->nslab;
1026     /* Determine for each PME slab the PP location range for dimension dim */
1027     snew(ddpme->pp_min, ddpme->nslab);
1028     snew(ddpme->pp_max, ddpme->nslab);
1029     for (slab = 0; slab < ddpme->nslab; slab++)
1030     {
1031         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1032         ddpme->pp_max[slab] = 0;
1033     }
1034     for (i = 0; i < dd->nnodes; i++)
1035     {
1036         ddindex2xyz(dd->nc, i, xyz);
1037         /* For y only use our y/z slab.
1038          * This assumes that the PME x grid size matches the DD grid size.
1039          */
1040         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1041         {
1042             pmeindex = ddindex2pmeindex(dd, i);
1043             if (dimind == 0)
1044             {
1045                 slab = pmeindex/nso;
1046             }
1047             else
1048             {
1049                 slab = pmeindex % ddpme->nslab;
1050             }
1051             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1052             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1053         }
1054     }
1055
1056     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1057 }
1058
1059 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1060 {
1061     if (dd->comm->ddpme[0].dim == XX)
1062     {
1063         return dd->comm->ddpme[0].maxshift;
1064     }
1065     else
1066     {
1067         return 0;
1068     }
1069 }
1070
1071 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1072 {
1073     if (dd->comm->ddpme[0].dim == YY)
1074     {
1075         return dd->comm->ddpme[0].maxshift;
1076     }
1077     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1078     {
1079         return dd->comm->ddpme[1].maxshift;
1080     }
1081     else
1082     {
1083         return 0;
1084     }
1085 }
1086
1087 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1088 {
1089     /* Note that the cycles value can be incorrect, either 0 or some
1090      * extremely large value, when our thread migrated to another core
1091      * with an unsynchronized cycle counter. If this happens less often
1092      * that once per nstlist steps, this will not cause issues, since
1093      * we later subtract the maximum value from the sum over nstlist steps.
1094      * A zero count will slightly lower the total, but that's a small effect.
1095      * Note that the main purpose of the subtraction of the maximum value
1096      * is to avoid throwing off the load balancing when stalls occur due
1097      * e.g. system activity or network congestion.
1098      */
1099     dd->comm->cycl[ddCycl] += cycles;
1100     dd->comm->cycl_n[ddCycl]++;
1101     if (cycles > dd->comm->cycl_max[ddCycl])
1102     {
1103         dd->comm->cycl_max[ddCycl] = cycles;
1104     }
1105 }
1106
1107 #if GMX_MPI
1108 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1109 {
1110     MPI_Comm           c_row;
1111     int                dim, i, rank;
1112     ivec               loc_c;
1113     gmx_bool           bPartOfGroup = FALSE;
1114
1115     dim = dd->dim[dim_ind];
1116     copy_ivec(loc, loc_c);
1117     for (i = 0; i < dd->nc[dim]; i++)
1118     {
1119         loc_c[dim] = i;
1120         rank       = dd_index(dd->nc, loc_c);
1121         if (rank == dd->rank)
1122         {
1123             /* This process is part of the group */
1124             bPartOfGroup = TRUE;
1125         }
1126     }
1127     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1128                    &c_row);
1129     if (bPartOfGroup)
1130     {
1131         dd->comm->mpi_comm_load[dim_ind] = c_row;
1132         if (!isDlbDisabled(dd->comm))
1133         {
1134             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1135
1136             if (dd->ci[dim] == dd->master_ci[dim])
1137             {
1138                 /* This is the root process of this row */
1139                 cellsizes.rowMaster  = std::make_unique<RowMaster>();
1140
1141                 RowMaster &rowMaster = *cellsizes.rowMaster;
1142                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1143                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1144                 rowMaster.isCellMin.resize(dd->nc[dim]);
1145                 if (dim_ind > 0)
1146                 {
1147                     rowMaster.bounds.resize(dd->nc[dim]);
1148                 }
1149                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1150             }
1151             else
1152             {
1153                 /* This is not a root process, we only need to receive cell_f */
1154                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1155             }
1156         }
1157         if (dd->ci[dim] == dd->master_ci[dim])
1158         {
1159             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1160         }
1161     }
1162 }
1163 #endif
1164
1165 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1166                                    int                   gpu_id)
1167 {
1168 #if GMX_MPI
1169     int           physicalnode_id_hash;
1170     gmx_domdec_t *dd;
1171     MPI_Comm      mpi_comm_pp_physicalnode;
1172
1173     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1174     {
1175         /* Only ranks with short-ranged tasks (currently) use GPUs.
1176          * If we don't have GPUs assigned, there are no resources to share.
1177          */
1178         return;
1179     }
1180
1181     physicalnode_id_hash = gmx_physicalnode_id_hash();
1182
1183     dd = cr->dd;
1184
1185     if (debug)
1186     {
1187         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1188         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1189                 dd->rank, physicalnode_id_hash, gpu_id);
1190     }
1191     /* Split the PP communicator over the physical nodes */
1192     /* TODO: See if we should store this (before), as it's also used for
1193      * for the nodecomm summation.
1194      */
1195     // TODO PhysicalNodeCommunicator could be extended/used to handle
1196     // the need for per-node per-group communicators.
1197     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1198                    &mpi_comm_pp_physicalnode);
1199     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1200                    &dd->comm->mpi_comm_gpu_shared);
1201     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1202     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1203
1204     if (debug)
1205     {
1206         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1207     }
1208
1209     /* Note that some ranks could share a GPU, while others don't */
1210
1211     if (dd->comm->nrank_gpu_shared == 1)
1212     {
1213         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1214     }
1215 #else
1216     GMX_UNUSED_VALUE(cr);
1217     GMX_UNUSED_VALUE(gpu_id);
1218 #endif
1219 }
1220
1221 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1222 {
1223 #if GMX_MPI
1224     int  dim0, dim1, i, j;
1225     ivec loc;
1226
1227     if (debug)
1228     {
1229         fprintf(debug, "Making load communicators\n");
1230     }
1231
1232     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1233     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1234
1235     if (dd->ndim == 0)
1236     {
1237         return;
1238     }
1239
1240     clear_ivec(loc);
1241     make_load_communicator(dd, 0, loc);
1242     if (dd->ndim > 1)
1243     {
1244         dim0 = dd->dim[0];
1245         for (i = 0; i < dd->nc[dim0]; i++)
1246         {
1247             loc[dim0] = i;
1248             make_load_communicator(dd, 1, loc);
1249         }
1250     }
1251     if (dd->ndim > 2)
1252     {
1253         dim0 = dd->dim[0];
1254         for (i = 0; i < dd->nc[dim0]; i++)
1255         {
1256             loc[dim0] = i;
1257             dim1      = dd->dim[1];
1258             for (j = 0; j < dd->nc[dim1]; j++)
1259             {
1260                 loc[dim1] = j;
1261                 make_load_communicator(dd, 2, loc);
1262             }
1263         }
1264     }
1265
1266     if (debug)
1267     {
1268         fprintf(debug, "Finished making load communicators\n");
1269     }
1270 #endif
1271 }
1272
1273 /*! \brief Sets up the relation between neighboring domains and zones */
1274 static void setup_neighbor_relations(gmx_domdec_t *dd)
1275 {
1276     int                     d, dim, i, j, m;
1277     ivec                    tmp, s;
1278     gmx_domdec_zones_t     *zones;
1279     gmx_domdec_ns_ranges_t *izone;
1280     GMX_ASSERT(dd->ndim >= 0, "Must have non-negative number of dimensions for DD");
1281
1282     for (d = 0; d < dd->ndim; d++)
1283     {
1284         dim = dd->dim[d];
1285         copy_ivec(dd->ci, tmp);
1286         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1287         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1288         copy_ivec(dd->ci, tmp);
1289         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1290         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1291         if (debug)
1292         {
1293             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1294                     dd->rank, dim,
1295                     dd->neighbor[d][0],
1296                     dd->neighbor[d][1]);
1297         }
1298     }
1299
1300     int nzone  = (1 << dd->ndim);
1301     GMX_ASSERT(dd->ndim < DIM, "Invalid number of dimensions");
1302     int nizone = (1 << std::max(dd->ndim - 1, 0));
1303     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1304
1305     zones = &dd->comm->zones;
1306
1307     for (i = 0; i < nzone; i++)
1308     {
1309         m = 0;
1310         clear_ivec(zones->shift[i]);
1311         for (d = 0; d < dd->ndim; d++)
1312         {
1313             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1314         }
1315     }
1316
1317     zones->n = nzone;
1318     for (i = 0; i < nzone; i++)
1319     {
1320         for (d = 0; d < DIM; d++)
1321         {
1322             s[d] = dd->ci[d] - zones->shift[i][d];
1323             if (s[d] < 0)
1324             {
1325                 s[d] += dd->nc[d];
1326             }
1327             else if (s[d] >= dd->nc[d])
1328             {
1329                 s[d] -= dd->nc[d];
1330             }
1331         }
1332     }
1333     zones->nizone = nizone;
1334     for (i = 0; i < zones->nizone; i++)
1335     {
1336         assert(ddNonbondedZonePairRanges[i][0] == i);
1337
1338         izone     = &zones->izone[i];
1339         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1340          * j-zones up to nzone.
1341          */
1342         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1343         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1344         for (dim = 0; dim < DIM; dim++)
1345         {
1346             if (dd->nc[dim] == 1)
1347             {
1348                 /* All shifts should be allowed */
1349                 izone->shift0[dim] = -1;
1350                 izone->shift1[dim] = 1;
1351             }
1352             else
1353             {
1354                 /* Determine the min/max j-zone shift wrt the i-zone */
1355                 izone->shift0[dim] = 1;
1356                 izone->shift1[dim] = -1;
1357                 for (j = izone->j0; j < izone->j1; j++)
1358                 {
1359                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1360                     if (shift_diff < izone->shift0[dim])
1361                     {
1362                         izone->shift0[dim] = shift_diff;
1363                     }
1364                     if (shift_diff > izone->shift1[dim])
1365                     {
1366                         izone->shift1[dim] = shift_diff;
1367                     }
1368                 }
1369             }
1370         }
1371     }
1372
1373     if (!isDlbDisabled(dd->comm))
1374     {
1375         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1376     }
1377
1378     if (dd->comm->bRecordLoad)
1379     {
1380         make_load_communicators(dd);
1381     }
1382 }
1383
1384 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1385                                  gmx_domdec_t         *dd,
1386                                  t_commrec gmx_unused *cr,
1387                                  bool gmx_unused       reorder)
1388 {
1389 #if GMX_MPI
1390     gmx_domdec_comm_t *comm;
1391     int                rank, *buf;
1392     ivec               periods;
1393     MPI_Comm           comm_cart;
1394
1395     comm = dd->comm;
1396
1397     if (comm->bCartesianPP)
1398     {
1399         /* Set up cartesian communication for the particle-particle part */
1400         GMX_LOG(mdlog.info).appendTextFormatted(
1401                 "Will use a Cartesian communicator: %d x %d x %d",
1402                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1403
1404         for (int i = 0; i < DIM; i++)
1405         {
1406             periods[i] = TRUE;
1407         }
1408         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1409                         &comm_cart);
1410         /* We overwrite the old communicator with the new cartesian one */
1411         cr->mpi_comm_mygroup = comm_cart;
1412     }
1413
1414     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1415     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1416
1417     if (comm->bCartesianPP_PME)
1418     {
1419         /* Since we want to use the original cartesian setup for sim,
1420          * and not the one after split, we need to make an index.
1421          */
1422         snew(comm->ddindex2ddnodeid, dd->nnodes);
1423         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1424         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1425         /* Get the rank of the DD master,
1426          * above we made sure that the master node is a PP node.
1427          */
1428         if (MASTER(cr))
1429         {
1430             rank = dd->rank;
1431         }
1432         else
1433         {
1434             rank = 0;
1435         }
1436         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1437     }
1438     else if (comm->bCartesianPP)
1439     {
1440         if (cr->npmenodes == 0)
1441         {
1442             /* The PP communicator is also
1443              * the communicator for this simulation
1444              */
1445             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1446         }
1447         cr->nodeid = dd->rank;
1448
1449         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1450
1451         /* We need to make an index to go from the coordinates
1452          * to the nodeid of this simulation.
1453          */
1454         snew(comm->ddindex2simnodeid, dd->nnodes);
1455         snew(buf, dd->nnodes);
1456         if (thisRankHasDuty(cr, DUTY_PP))
1457         {
1458             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1459         }
1460         /* Communicate the ddindex to simulation nodeid index */
1461         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1462                       cr->mpi_comm_mysim);
1463         sfree(buf);
1464
1465         /* Determine the master coordinates and rank.
1466          * The DD master should be the same node as the master of this sim.
1467          */
1468         for (int i = 0; i < dd->nnodes; i++)
1469         {
1470             if (comm->ddindex2simnodeid[i] == 0)
1471             {
1472                 ddindex2xyz(dd->nc, i, dd->master_ci);
1473                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1474             }
1475         }
1476         if (debug)
1477         {
1478             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1479         }
1480     }
1481     else
1482     {
1483         /* No Cartesian communicators */
1484         /* We use the rank in dd->comm->all as DD index */
1485         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1486         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1487         dd->masterrank = 0;
1488         clear_ivec(dd->master_ci);
1489     }
1490 #endif
1491
1492     GMX_LOG(mdlog.info).appendTextFormatted(
1493             "Domain decomposition rank %d, coordinates %d %d %d\n",
1494             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1495     if (debug)
1496     {
1497         fprintf(debug,
1498                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1499                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1500     }
1501 }
1502
1503 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1504                                       t_commrec            *cr)
1505 {
1506 #if GMX_MPI
1507     gmx_domdec_comm_t *comm = dd->comm;
1508
1509     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1510     {
1511         int *buf;
1512         snew(comm->ddindex2simnodeid, dd->nnodes);
1513         snew(buf, dd->nnodes);
1514         if (thisRankHasDuty(cr, DUTY_PP))
1515         {
1516             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1517         }
1518         /* Communicate the ddindex to simulation nodeid index */
1519         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1520                       cr->mpi_comm_mysim);
1521         sfree(buf);
1522     }
1523 #else
1524     GMX_UNUSED_VALUE(dd);
1525     GMX_UNUSED_VALUE(cr);
1526 #endif
1527 }
1528
1529 static void split_communicator(const gmx::MDLogger &mdlog,
1530                                t_commrec *cr, gmx_domdec_t *dd,
1531                                DdRankOrder gmx_unused rankOrder,
1532                                bool gmx_unused reorder)
1533 {
1534     gmx_domdec_comm_t *comm;
1535     int                i;
1536     gmx_bool           bDiv[DIM];
1537 #if GMX_MPI
1538     MPI_Comm           comm_cart;
1539 #endif
1540
1541     comm = dd->comm;
1542
1543     if (comm->bCartesianPP)
1544     {
1545         for (i = 1; i < DIM; i++)
1546         {
1547             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1548         }
1549         if (bDiv[YY] || bDiv[ZZ])
1550         {
1551             comm->bCartesianPP_PME = TRUE;
1552             /* If we have 2D PME decomposition, which is always in x+y,
1553              * we stack the PME only nodes in z.
1554              * Otherwise we choose the direction that provides the thinnest slab
1555              * of PME only nodes as this will have the least effect
1556              * on the PP communication.
1557              * But for the PME communication the opposite might be better.
1558              */
1559             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1560                              !bDiv[YY] ||
1561                              dd->nc[YY] > dd->nc[ZZ]))
1562             {
1563                 comm->cartpmedim = ZZ;
1564             }
1565             else
1566             {
1567                 comm->cartpmedim = YY;
1568             }
1569             comm->ntot[comm->cartpmedim]
1570                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1571         }
1572         else
1573         {
1574             GMX_LOG(mdlog.info).appendTextFormatted(
1575                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1576                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1577             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1578         }
1579     }
1580
1581     if (comm->bCartesianPP_PME)
1582     {
1583 #if GMX_MPI
1584         int  rank;
1585         ivec periods;
1586
1587         GMX_LOG(mdlog.info).appendTextFormatted(
1588                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1589                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1590
1591         for (i = 0; i < DIM; i++)
1592         {
1593             periods[i] = TRUE;
1594         }
1595         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1596                         &comm_cart);
1597         MPI_Comm_rank(comm_cart, &rank);
1598         if (MASTER(cr) && rank != 0)
1599         {
1600             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1601         }
1602
1603         /* With this assigment we loose the link to the original communicator
1604          * which will usually be MPI_COMM_WORLD, unless have multisim.
1605          */
1606         cr->mpi_comm_mysim = comm_cart;
1607         cr->sim_nodeid     = rank;
1608
1609         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1610
1611         GMX_LOG(mdlog.info).appendTextFormatted(
1612                 "Cartesian rank %d, coordinates %d %d %d\n",
1613                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1614
1615         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1616         {
1617             cr->duty = DUTY_PP;
1618         }
1619         if (cr->npmenodes == 0 ||
1620             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1621         {
1622             cr->duty = DUTY_PME;
1623         }
1624
1625         /* Split the sim communicator into PP and PME only nodes */
1626         MPI_Comm_split(cr->mpi_comm_mysim,
1627                        getThisRankDuties(cr),
1628                        dd_index(comm->ntot, dd->ci),
1629                        &cr->mpi_comm_mygroup);
1630 #endif
1631     }
1632     else
1633     {
1634         switch (rankOrder)
1635         {
1636             case DdRankOrder::pp_pme:
1637                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1638                 break;
1639             case DdRankOrder::interleave:
1640                 /* Interleave the PP-only and PME-only ranks */
1641                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1642                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1643                 break;
1644             case DdRankOrder::cartesian:
1645                 break;
1646             default:
1647                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1648         }
1649
1650         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1651         {
1652             cr->duty = DUTY_PME;
1653         }
1654         else
1655         {
1656             cr->duty = DUTY_PP;
1657         }
1658 #if GMX_MPI
1659         /* Split the sim communicator into PP and PME only nodes */
1660         MPI_Comm_split(cr->mpi_comm_mysim,
1661                        getThisRankDuties(cr),
1662                        cr->nodeid,
1663                        &cr->mpi_comm_mygroup);
1664         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1665 #endif
1666     }
1667
1668     GMX_LOG(mdlog.info).appendTextFormatted(
1669             "This rank does only %s work.\n",
1670             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1671 }
1672
1673 /*! \brief Generates the MPI communicators for domain decomposition */
1674 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1675                                   t_commrec *cr,
1676                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1677 {
1678     gmx_domdec_comm_t *comm;
1679     bool               CartReorder;
1680
1681     comm = dd->comm;
1682
1683     copy_ivec(dd->nc, comm->ntot);
1684
1685     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1686     comm->bCartesianPP_PME = FALSE;
1687
1688     /* Reorder the nodes by default. This might change the MPI ranks.
1689      * Real reordering is only supported on very few architectures,
1690      * Blue Gene is one of them.
1691      */
1692     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1693
1694     if (cr->npmenodes > 0)
1695     {
1696         /* Split the communicator into a PP and PME part */
1697         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1698         if (comm->bCartesianPP_PME)
1699         {
1700             /* We (possibly) reordered the nodes in split_communicator,
1701              * so it is no longer required in make_pp_communicator.
1702              */
1703             CartReorder = FALSE;
1704         }
1705     }
1706     else
1707     {
1708         /* All nodes do PP and PME */
1709 #if GMX_MPI
1710         /* We do not require separate communicators */
1711         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1712 #endif
1713     }
1714
1715     if (thisRankHasDuty(cr, DUTY_PP))
1716     {
1717         /* Copy or make a new PP communicator */
1718         make_pp_communicator(mdlog, dd, cr, CartReorder);
1719     }
1720     else
1721     {
1722         receive_ddindex2simnodeid(dd, cr);
1723     }
1724
1725     if (!thisRankHasDuty(cr, DUTY_PME))
1726     {
1727         /* Set up the commnuication to our PME node */
1728         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1729         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1730         if (debug)
1731         {
1732             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1733                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1734         }
1735     }
1736     else
1737     {
1738         dd->pme_nodeid = -1;
1739     }
1740
1741     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1742     if (MASTER(cr))
1743     {
1744         dd->ma = std::make_unique<AtomDistribution>(dd->nc,
1745                                                     comm->cgs_gl.nr,
1746                                                     comm->cgs_gl.index[comm->cgs_gl.nr]);
1747     }
1748 }
1749
1750 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1751                           const char *dir, int nc, const char *size_string)
1752 {
1753     real  *slb_frac, tot;
1754     int    i, n;
1755     double dbl;
1756
1757     slb_frac = nullptr;
1758     if (nc > 1 && size_string != nullptr)
1759     {
1760         GMX_LOG(mdlog.info).appendTextFormatted(
1761                 "Using static load balancing for the %s direction", dir);
1762         snew(slb_frac, nc);
1763         tot = 0;
1764         for (i = 0; i < nc; i++)
1765         {
1766             dbl = 0;
1767             sscanf(size_string, "%20lf%n", &dbl, &n);
1768             if (dbl == 0)
1769             {
1770                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1771             }
1772             slb_frac[i]  = dbl;
1773             size_string += n;
1774             tot         += slb_frac[i];
1775         }
1776         /* Normalize */
1777         std::string relativeCellSizes = "Relative cell sizes:";
1778         for (i = 0; i < nc; i++)
1779         {
1780             slb_frac[i]       /= tot;
1781             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1782         }
1783         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1784     }
1785
1786     return slb_frac;
1787 }
1788
1789 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1790 {
1791     int                  n     = 0;
1792     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1793     int                  nmol;
1794     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1795     {
1796         for (auto &ilist : extractILists(*ilists, IF_BOND))
1797         {
1798             if (NRAL(ilist.functionType) >  2)
1799             {
1800                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1801             }
1802         }
1803     }
1804
1805     return n;
1806 }
1807
1808 static int dd_getenv(const gmx::MDLogger &mdlog,
1809                      const char *env_var, int def)
1810 {
1811     char *val;
1812     int   nst;
1813
1814     nst = def;
1815     val = getenv(env_var);
1816     if (val)
1817     {
1818         if (sscanf(val, "%20d", &nst) <= 0)
1819         {
1820             nst = 1;
1821         }
1822         GMX_LOG(mdlog.info).appendTextFormatted(
1823                 "Found env.var. %s = %s, using value %d",
1824                 env_var, val, nst);
1825     }
1826
1827     return nst;
1828 }
1829
1830 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1831                                   const t_inputrec    *ir,
1832                                   const gmx::MDLogger &mdlog)
1833 {
1834     if (ir->ePBC == epbcSCREW &&
1835         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1836     {
1837         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1838     }
1839
1840     if (ir->ns_type == ensSIMPLE)
1841     {
1842         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1843     }
1844
1845     if (ir->nstlist == 0)
1846     {
1847         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1848     }
1849
1850     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1851     {
1852         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1853     }
1854 }
1855
1856 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1857 {
1858     int  di, d;
1859     real r;
1860
1861     r = ddbox->box_size[XX];
1862     for (di = 0; di < dd->ndim; di++)
1863     {
1864         d = dd->dim[di];
1865         /* Check using the initial average cell size */
1866         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1867     }
1868
1869     return r;
1870 }
1871
1872 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1873  */
1874 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1875                                   const std::string   &reasonStr,
1876                                   const gmx::MDLogger &mdlog)
1877 {
1878     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1879     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1880
1881     if (cmdlineDlbState == DlbState::onUser)
1882     {
1883         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1884     }
1885     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1886     {
1887         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1888     }
1889     return DlbState::offForever;
1890 }
1891
1892 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1893  *
1894  * This function parses the parameters of "-dlb" command line option setting
1895  * corresponding state values. Then it checks the consistency of the determined
1896  * state with other run parameters and settings. As a result, the initial state
1897  * may be altered or an error may be thrown if incompatibility of options is detected.
1898  *
1899  * \param [in] mdlog       Logger.
1900  * \param [in] dlbOption   Enum value for the DLB option.
1901  * \param [in] bRecordLoad True if the load balancer is recording load information.
1902  * \param [in] mdrunOptions  Options for mdrun.
1903  * \param [in] ir          Pointer mdrun to input parameters.
1904  * \returns                DLB initial/startup state.
1905  */
1906 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1907                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1908                                          const gmx::MdrunOptions &mdrunOptions,
1909                                          const t_inputrec *ir)
1910 {
1911     DlbState dlbState = DlbState::offCanTurnOn;
1912
1913     switch (dlbOption)
1914     {
1915         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1916         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1917         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1918         default: gmx_incons("Invalid dlbOption enum value");
1919     }
1920
1921     /* Reruns don't support DLB: bail or override auto mode */
1922     if (mdrunOptions.rerun)
1923     {
1924         std::string reasonStr = "it is not supported in reruns.";
1925         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1926     }
1927
1928     /* Unsupported integrators */
1929     if (!EI_DYNAMICS(ir->eI))
1930     {
1931         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1932         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1933     }
1934
1935     /* Without cycle counters we can't time work to balance on */
1936     if (!bRecordLoad)
1937     {
1938         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1939         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1940     }
1941
1942     if (mdrunOptions.reproducible)
1943     {
1944         std::string reasonStr = "you started a reproducible run.";
1945         switch (dlbState)
1946         {
1947             case DlbState::offUser:
1948                 break;
1949             case DlbState::offForever:
1950                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1951                 break;
1952             case DlbState::offCanTurnOn:
1953                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1954             case DlbState::onCanTurnOff:
1955                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1956                 break;
1957             case DlbState::onUser:
1958                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1959             default:
1960                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1961         }
1962     }
1963
1964     return dlbState;
1965 }
1966
1967 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1968 {
1969     dd->ndim = 0;
1970     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
1971     {
1972         /* Decomposition order z,y,x */
1973         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
1974         for (int dim = DIM-1; dim >= 0; dim--)
1975         {
1976             if (dd->nc[dim] > 1)
1977             {
1978                 dd->dim[dd->ndim++] = dim;
1979             }
1980         }
1981     }
1982     else
1983     {
1984         /* Decomposition order x,y,z */
1985         for (int dim = 0; dim < DIM; dim++)
1986         {
1987             if (dd->nc[dim] > 1)
1988             {
1989                 dd->dim[dd->ndim++] = dim;
1990             }
1991         }
1992     }
1993
1994     if (dd->ndim == 0)
1995     {
1996         /* Set dim[0] to avoid extra checks on ndim in several places */
1997         dd->dim[0] = XX;
1998     }
1999 }
2000
2001 static gmx_domdec_comm_t *init_dd_comm()
2002 {
2003     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2004
2005     comm->n_load_have      = 0;
2006     comm->n_load_collect   = 0;
2007
2008     comm->haveTurnedOffDlb = false;
2009
2010     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2011     {
2012         comm->sum_nat[i] = 0;
2013     }
2014     comm->ndecomp   = 0;
2015     comm->nload     = 0;
2016     comm->load_step = 0;
2017     comm->load_sum  = 0;
2018     comm->load_max  = 0;
2019     clear_ivec(comm->load_lim);
2020     comm->load_mdf  = 0;
2021     comm->load_pme  = 0;
2022
2023     /* This should be replaced by a unique pointer */
2024     comm->balanceRegion = ddBalanceRegionAllocate();
2025
2026     return comm;
2027 }
2028
2029 /* Returns whether mtop contains constraints and/or vsites */
2030 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2031 {
2032     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2033     int  nmol;
2034     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2035     {
2036         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2037         {
2038             return true;
2039         }
2040     }
2041
2042     return false;
2043 }
2044
2045 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2046                               const gmx_mtop_t    &mtop,
2047                               const t_inputrec    &inputrec,
2048                               real                 cutoffMargin,
2049                               int                  numMpiRanksTotal,
2050                               gmx_domdec_comm_t   *comm)
2051 {
2052     /* When we have constraints and/or vsites, it is beneficial to use
2053      * update groups (when possible) to allow independent update of groups.
2054      */
2055     if (!systemHasConstraintsOrVsites(mtop))
2056     {
2057         /* No constraints or vsites, atoms can be updated independently */
2058         return;
2059     }
2060
2061     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2062     comm->useUpdateGroups               =
2063         (!comm->updateGroupingPerMoleculetype.empty() &&
2064          getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2065
2066     if (comm->useUpdateGroups)
2067     {
2068         int numUpdateGroups = 0;
2069         for (const auto &molblock : mtop.molblock)
2070         {
2071             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2072         }
2073
2074         /* Note: We would like to use dd->nnodes for the atom count estimate,
2075          *       but that is not yet available here. But this anyhow only
2076          *       affect performance up to the second dd_partition_system call.
2077          */
2078         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2079         comm->updateGroupsCog =
2080             std::make_unique<gmx::UpdateGroupsCog>(mtop,
2081                                                    comm->updateGroupingPerMoleculetype,
2082                                                    maxReferenceTemperature(inputrec),
2083                                                    homeAtomCountEstimate);
2084
2085         /* To use update groups, the large domain-to-domain cutoff distance
2086          * should be compatible with the box size.
2087          */
2088         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2089
2090         if (comm->useUpdateGroups)
2091         {
2092             GMX_LOG(mdlog.info).appendTextFormatted(
2093                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2094                     numUpdateGroups,
2095                     mtop.natoms/static_cast<double>(numUpdateGroups),
2096                     comm->updateGroupsCog->maxUpdateGroupRadius());
2097         }
2098         else
2099         {
2100             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2101             comm->updateGroupingPerMoleculetype.clear();
2102             comm->updateGroupsCog.reset(nullptr);
2103         }
2104     }
2105 }
2106
2107 UnitCellInfo::UnitCellInfo(const t_inputrec &ir) :
2108     npbcdim(ePBC2npbcdim(ir.ePBC)),
2109     numBoundedDimensions(inputrec2nboundeddim(&ir)),
2110     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
2111     haveScrewPBC(ir.ePBC == epbcSCREW)
2112 {
2113 }
2114
2115 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2116 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2117                                    t_commrec *cr, gmx_domdec_t *dd,
2118                                    const DomdecOptions &options,
2119                                    const gmx::MdrunOptions &mdrunOptions,
2120                                    const gmx_mtop_t *mtop,
2121                                    const t_inputrec *ir,
2122                                    const matrix box,
2123                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2124                                    gmx_ddbox_t *ddbox)
2125 {
2126     real               r_bonded         = -1;
2127     real               r_bonded_limit   = -1;
2128     const real         tenPercentMargin = 1.1;
2129     gmx_domdec_comm_t *comm             = dd->comm;
2130
2131     dd->pme_recv_f_alloc = 0;
2132     dd->pme_recv_f_buf   = nullptr;
2133
2134     /* Initialize to GPU share count to 0, might change later */
2135     comm->nrank_gpu_shared = 0;
2136
2137     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2138     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2139     /* To consider turning DLB on after 2*nstlist steps we need to check
2140      * at partitioning count 3. Thus we need to increase the first count by 2.
2141      */
2142     comm->ddPartioningCountFirstDlbOff += 2;
2143
2144     GMX_LOG(mdlog.info).appendTextFormatted(
2145             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2146
2147     comm->bPMELoadBalDLBLimits = FALSE;
2148
2149     /* Allocate the charge group/atom sorting struct */
2150     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2151
2152     /* We need to decide on update groups early, as this affects communication distances */
2153     comm->useUpdateGroups = false;
2154     if (ir->cutoff_scheme == ecutsVERLET)
2155     {
2156         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2157         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2158     }
2159
2160     // TODO: Check whether all bondeds are within update groups
2161     comm->haveInterDomainBondeds          = (mtop->natoms > gmx_mtop_num_molecules(*mtop) ||
2162                                              mtop->bIntermolecularInteractions);
2163     comm->haveInterDomainMultiBodyBondeds = (multi_body_bondeds_count(mtop) > 0);
2164
2165     if (comm->useUpdateGroups)
2166     {
2167         dd->splitConstraints = false;
2168         dd->splitSettles     = false;
2169     }
2170     else
2171     {
2172         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2173         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2174     }
2175
2176     if (ir->rlist == 0)
2177     {
2178         /* Set the cut-off to some very large value,
2179          * so we don't need if statements everywhere in the code.
2180          * We use sqrt, since the cut-off is squared in some places.
2181          */
2182         comm->cutoff   = GMX_CUTOFF_INF;
2183     }
2184     else
2185     {
2186         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2187     }
2188     comm->cutoff_mbody = 0;
2189
2190     /* Determine the minimum cell size limit, affected by many factors */
2191     comm->cellsize_limit = 0;
2192     comm->bBondComm      = FALSE;
2193
2194     /* We do not allow home atoms to move beyond the neighboring domain
2195      * between domain decomposition steps, which limits the cell size.
2196      * Get an estimate of cell size limit due to atom displacement.
2197      * In most cases this is a large overestimate, because it assumes
2198      * non-interaction atoms.
2199      * We set the chance to 1 in a trillion steps.
2200      */
2201     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2202     const real     limitForAtomDisplacement          =
2203         minCellSizeForAtomDisplacement(*mtop, *ir,
2204                                        comm->updateGroupingPerMoleculetype,
2205                                        c_chanceThatAtomMovesBeyondDomain);
2206     GMX_LOG(mdlog.info).appendTextFormatted(
2207             "Minimum cell size due to atom displacement: %.3f nm",
2208             limitForAtomDisplacement);
2209
2210     comm->cellsize_limit = std::max(comm->cellsize_limit,
2211                                     limitForAtomDisplacement);
2212
2213     /* TODO: PME decomposition currently requires atoms not to be more than
2214      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2215      *       In nearly all cases, limitForAtomDisplacement will be smaller
2216      *       than 2/3*rlist, so the PME requirement is satisfied.
2217      *       But it would be better for both correctness and performance
2218      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2219      *       Note that we would need to improve the pairlist buffer case.
2220      */
2221
2222     if (comm->haveInterDomainBondeds)
2223     {
2224         if (options.minimumCommunicationRange > 0)
2225         {
2226             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2227             if (options.useBondedCommunication)
2228             {
2229                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2230             }
2231             else
2232             {
2233                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2234             }
2235             r_bonded_limit = comm->cutoff_mbody;
2236         }
2237         else if (ir->bPeriodicMols)
2238         {
2239             /* Can not easily determine the required cut-off */
2240             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2241             comm->cutoff_mbody = comm->cutoff/2;
2242             r_bonded_limit     = comm->cutoff_mbody;
2243         }
2244         else
2245         {
2246             real r_2b, r_mb;
2247
2248             if (MASTER(cr))
2249             {
2250                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2251                                       options.checkBondedInteractions,
2252                                       &r_2b, &r_mb);
2253             }
2254             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2255             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2256
2257             /* We use an initial margin of 10% for the minimum cell size,
2258              * except when we are just below the non-bonded cut-off.
2259              */
2260             if (options.useBondedCommunication)
2261             {
2262                 if (std::max(r_2b, r_mb) > comm->cutoff)
2263                 {
2264                     r_bonded        = std::max(r_2b, r_mb);
2265                     r_bonded_limit  = tenPercentMargin*r_bonded;
2266                     comm->bBondComm = TRUE;
2267                 }
2268                 else
2269                 {
2270                     r_bonded       = r_mb;
2271                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2272                 }
2273                 /* We determine cutoff_mbody later */
2274             }
2275             else
2276             {
2277                 /* No special bonded communication,
2278                  * simply increase the DD cut-off.
2279                  */
2280                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2281                 comm->cutoff_mbody = r_bonded_limit;
2282                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2283             }
2284         }
2285         GMX_LOG(mdlog.info).appendTextFormatted(
2286                 "Minimum cell size due to bonded interactions: %.3f nm",
2287                 r_bonded_limit);
2288
2289         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2290     }
2291
2292     real rconstr = 0;
2293     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2294     {
2295         /* There is a cell size limit due to the constraints (P-LINCS) */
2296         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2297         GMX_LOG(mdlog.info).appendTextFormatted(
2298                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2299                 rconstr);
2300         if (rconstr > comm->cellsize_limit)
2301         {
2302             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2303         }
2304     }
2305     else if (options.constraintCommunicationRange > 0)
2306     {
2307         /* Here we do not check for dd->splitConstraints.
2308          * because one can also set a cell size limit for virtual sites only
2309          * and at this point we don't know yet if there are intercg v-sites.
2310          */
2311         GMX_LOG(mdlog.info).appendTextFormatted(
2312                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2313                 options.constraintCommunicationRange);
2314         rconstr = options.constraintCommunicationRange;
2315     }
2316     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2317
2318     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2319
2320     if (options.numCells[XX] > 0)
2321     {
2322         copy_ivec(options.numCells, dd->nc);
2323         set_dd_dim(mdlog, dd);
2324         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2325
2326         if (options.numPmeRanks >= 0)
2327         {
2328             cr->npmenodes = options.numPmeRanks;
2329         }
2330         else
2331         {
2332             /* When the DD grid is set explicitly and -npme is set to auto,
2333              * don't use PME ranks. We check later if the DD grid is
2334              * compatible with the total number of ranks.
2335              */
2336             cr->npmenodes = 0;
2337         }
2338
2339         real acs = average_cellsize_min(dd, ddbox);
2340         if (acs < comm->cellsize_limit)
2341         {
2342             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2343                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2344                                  acs, comm->cellsize_limit);
2345         }
2346     }
2347     else
2348     {
2349         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2350
2351         /* We need to choose the optimal DD grid and possibly PME nodes */
2352         real limit =
2353             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2354                            options.numPmeRanks,
2355                            !isDlbDisabled(comm),
2356                            options.dlbScaling,
2357                            comm->cellsize_limit, comm->cutoff,
2358                            comm->haveInterDomainBondeds);
2359
2360         if (dd->nc[XX] == 0)
2361         {
2362             char     buf[STRLEN];
2363             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2364             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2365                     !bC ? "-rdd" : "-rcon",
2366                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2367                     bC ? " or your LINCS settings" : "");
2368
2369             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2370                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2371                                  "%s\n"
2372                                  "Look in the log file for details on the domain decomposition",
2373                                  cr->nnodes-cr->npmenodes, limit, buf);
2374         }
2375         set_dd_dim(mdlog, dd);
2376     }
2377
2378     GMX_LOG(mdlog.info).appendTextFormatted(
2379             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2380             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2381
2382     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2383     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2384     {
2385         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2386                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2387                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2388     }
2389     if (cr->npmenodes > dd->nnodes)
2390     {
2391         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2392                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2393     }
2394     if (cr->npmenodes > 0)
2395     {
2396         comm->npmenodes = cr->npmenodes;
2397     }
2398     else
2399     {
2400         comm->npmenodes = dd->nnodes;
2401     }
2402
2403     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2404     {
2405         /* The following choices should match those
2406          * in comm_cost_est in domdec_setup.c.
2407          * Note that here the checks have to take into account
2408          * that the decomposition might occur in a different order than xyz
2409          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2410          * in which case they will not match those in comm_cost_est,
2411          * but since that is mainly for testing purposes that's fine.
2412          */
2413         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2414             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2415             getenv("GMX_PMEONEDD") == nullptr)
2416         {
2417             comm->npmedecompdim = 2;
2418             comm->npmenodes_x   = dd->nc[XX];
2419             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2420         }
2421         else
2422         {
2423             /* In case nc is 1 in both x and y we could still choose to
2424              * decompose pme in y instead of x, but we use x for simplicity.
2425              */
2426             comm->npmedecompdim = 1;
2427             if (dd->dim[0] == YY)
2428             {
2429                 comm->npmenodes_x = 1;
2430                 comm->npmenodes_y = comm->npmenodes;
2431             }
2432             else
2433             {
2434                 comm->npmenodes_x = comm->npmenodes;
2435                 comm->npmenodes_y = 1;
2436             }
2437         }
2438         GMX_LOG(mdlog.info).appendTextFormatted(
2439                 "PME domain decomposition: %d x %d x %d",
2440                 comm->npmenodes_x, comm->npmenodes_y, 1);
2441     }
2442     else
2443     {
2444         comm->npmedecompdim = 0;
2445         comm->npmenodes_x   = 0;
2446         comm->npmenodes_y   = 0;
2447     }
2448
2449     snew(comm->slb_frac, DIM);
2450     if (isDlbDisabled(comm))
2451     {
2452         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2453         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2454         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2455     }
2456
2457     if (comm->haveInterDomainBondeds && comm->cutoff_mbody == 0)
2458     {
2459         if (comm->bBondComm || !isDlbDisabled(comm))
2460         {
2461             /* Set the bonded communication distance to halfway
2462              * the minimum and the maximum,
2463              * since the extra communication cost is nearly zero.
2464              */
2465             real acs           = average_cellsize_min(dd, ddbox);
2466             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2467             if (!isDlbDisabled(comm))
2468             {
2469                 /* Check if this does not limit the scaling */
2470                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2471                                               options.dlbScaling*acs);
2472             }
2473             if (!comm->bBondComm)
2474             {
2475                 /* Without bBondComm do not go beyond the n.b. cut-off */
2476                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2477                 if (comm->cellsize_limit >= comm->cutoff)
2478                 {
2479                     /* We don't loose a lot of efficieny
2480                      * when increasing it to the n.b. cut-off.
2481                      * It can even be slightly faster, because we need
2482                      * less checks for the communication setup.
2483                      */
2484                     comm->cutoff_mbody = comm->cutoff;
2485                 }
2486             }
2487             /* Check if we did not end up below our original limit */
2488             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2489
2490             if (comm->cutoff_mbody > comm->cellsize_limit)
2491             {
2492                 comm->cellsize_limit = comm->cutoff_mbody;
2493             }
2494         }
2495         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2496     }
2497
2498     if (debug)
2499     {
2500         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2501                 "cellsize limit %f\n",
2502                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2503     }
2504
2505     if (MASTER(cr))
2506     {
2507         check_dd_restrictions(dd, ir, mdlog);
2508     }
2509 }
2510
2511 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2512 {
2513     int   ncg, cg;
2514     char *bLocalCG;
2515
2516     ncg = ncg_mtop(mtop);
2517     snew(bLocalCG, ncg);
2518     for (cg = 0; cg < ncg; cg++)
2519     {
2520         bLocalCG[cg] = FALSE;
2521     }
2522
2523     return bLocalCG;
2524 }
2525
2526 void dd_init_bondeds(FILE *fplog,
2527                      gmx_domdec_t *dd,
2528                      const gmx_mtop_t *mtop,
2529                      const gmx_vsite_t *vsite,
2530                      const t_inputrec *ir,
2531                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2532 {
2533     gmx_domdec_comm_t *comm;
2534
2535     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2536
2537     comm = dd->comm;
2538
2539     if (comm->bBondComm)
2540     {
2541         /* Communicate atoms beyond the cut-off for bonded interactions */
2542         comm = dd->comm;
2543
2544         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2545
2546         comm->bLocalCG = init_bLocalCG(mtop);
2547     }
2548     else
2549     {
2550         /* Only communicate atoms based on cut-off */
2551         comm->cglink   = nullptr;
2552         comm->bLocalCG = nullptr;
2553     }
2554 }
2555
2556 static void writeSettings(gmx::TextWriter       *log,
2557                           gmx_domdec_t          *dd,
2558                           const gmx_mtop_t      *mtop,
2559                           const t_inputrec      *ir,
2560                           gmx_bool               bDynLoadBal,
2561                           real                   dlb_scale,
2562                           const gmx_ddbox_t     *ddbox)
2563 {
2564     gmx_domdec_comm_t *comm;
2565     int                d;
2566     ivec               np;
2567     real               limit, shrink;
2568
2569     comm = dd->comm;
2570
2571     if (bDynLoadBal)
2572     {
2573         log->writeString("The maximum number of communication pulses is:");
2574         for (d = 0; d < dd->ndim; d++)
2575         {
2576             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2577         }
2578         log->ensureLineBreak();
2579         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2580         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2581         log->writeString("The allowed shrink of domain decomposition cells is:");
2582         for (d = 0; d < DIM; d++)
2583         {
2584             if (dd->nc[d] > 1)
2585             {
2586                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2587                 {
2588                     shrink = 0;
2589                 }
2590                 else
2591                 {
2592                     shrink =
2593                         comm->cellsize_min_dlb[d]/
2594                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2595                 }
2596                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2597             }
2598         }
2599         log->ensureLineBreak();
2600     }
2601     else
2602     {
2603         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2604         log->writeString("The initial number of communication pulses is:");
2605         for (d = 0; d < dd->ndim; d++)
2606         {
2607             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2608         }
2609         log->ensureLineBreak();
2610         log->writeString("The initial domain decomposition cell size is:");
2611         for (d = 0; d < DIM; d++)
2612         {
2613             if (dd->nc[d] > 1)
2614             {
2615                 log->writeStringFormatted(" %c %.2f nm",
2616                                           dim2char(d), dd->comm->cellsize_min[d]);
2617             }
2618         }
2619         log->ensureLineBreak();
2620         log->writeLine();
2621     }
2622
2623     const bool haveInterDomainVsites =
2624         (countInterUpdategroupVsites(*mtop, comm->updateGroupingPerMoleculetype) != 0);
2625
2626     if (comm->haveInterDomainBondeds ||
2627         haveInterDomainVsites ||
2628         dd->splitConstraints || dd->splitSettles)
2629     {
2630         std::string decompUnits;
2631         if (comm->useUpdateGroups)
2632         {
2633             decompUnits = "atom groups";
2634         }
2635         else
2636         {
2637             decompUnits = "atoms";
2638         }
2639
2640         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2641         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2642
2643         if (bDynLoadBal)
2644         {
2645             limit = dd->comm->cellsize_limit;
2646         }
2647         else
2648         {
2649             if (dd->unitCellInfo.ddBoxIsDynamic)
2650             {
2651                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2652             }
2653             limit = dd->comm->cellsize_min[XX];
2654             for (d = 1; d < DIM; d++)
2655             {
2656                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2657             }
2658         }
2659
2660         if (comm->haveInterDomainBondeds)
2661         {
2662             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2663                                     "two-body bonded interactions", "(-rdd)",
2664                                     std::max(comm->cutoff, comm->cutoff_mbody));
2665             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2666                                     "multi-body bonded interactions", "(-rdd)",
2667                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2668         }
2669         if (haveInterDomainVsites)
2670         {
2671             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2672                                     "virtual site constructions", "(-rcon)", limit);
2673         }
2674         if (dd->splitConstraints || dd->splitSettles)
2675         {
2676             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2677                                                        1+ir->nProjOrder);
2678             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2679                                     separation.c_str(), "(-rcon)", limit);
2680         }
2681         log->ensureLineBreak();
2682     }
2683 }
2684
2685 static void logSettings(const gmx::MDLogger &mdlog,
2686                         gmx_domdec_t        *dd,
2687                         const gmx_mtop_t    *mtop,
2688                         const t_inputrec    *ir,
2689                         real                 dlb_scale,
2690                         const gmx_ddbox_t   *ddbox)
2691 {
2692     gmx::StringOutputStream stream;
2693     gmx::TextWriter         log(&stream);
2694     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2695     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2696     {
2697         {
2698             log.ensureEmptyLine();
2699             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2700         }
2701         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2702     }
2703     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2704 }
2705
2706 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2707                                 gmx_domdec_t        *dd,
2708                                 real                 dlb_scale,
2709                                 const t_inputrec    *ir,
2710                                 const gmx_ddbox_t   *ddbox)
2711 {
2712     gmx_domdec_comm_t *comm;
2713     int                d, dim, npulse, npulse_d_max, npulse_d;
2714     gmx_bool           bNoCutOff;
2715
2716     comm = dd->comm;
2717
2718     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2719
2720     /* Determine the maximum number of comm. pulses in one dimension */
2721
2722     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2723
2724     /* Determine the maximum required number of grid pulses */
2725     if (comm->cellsize_limit >= comm->cutoff)
2726     {
2727         /* Only a single pulse is required */
2728         npulse = 1;
2729     }
2730     else if (!bNoCutOff && comm->cellsize_limit > 0)
2731     {
2732         /* We round down slightly here to avoid overhead due to the latency
2733          * of extra communication calls when the cut-off
2734          * would be only slightly longer than the cell size.
2735          * Later cellsize_limit is redetermined,
2736          * so we can not miss interactions due to this rounding.
2737          */
2738         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2739     }
2740     else
2741     {
2742         /* There is no cell size limit */
2743         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2744     }
2745
2746     if (!bNoCutOff && npulse > 1)
2747     {
2748         /* See if we can do with less pulses, based on dlb_scale */
2749         npulse_d_max = 0;
2750         for (d = 0; d < dd->ndim; d++)
2751         {
2752             dim      = dd->dim[d];
2753             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2754                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2755             npulse_d_max = std::max(npulse_d_max, npulse_d);
2756         }
2757         npulse = std::min(npulse, npulse_d_max);
2758     }
2759
2760     /* This env var can override npulse */
2761     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2762     if (d > 0)
2763     {
2764         npulse = d;
2765     }
2766
2767     comm->maxpulse       = 1;
2768     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2769     for (d = 0; d < dd->ndim; d++)
2770     {
2771         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2772         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2773         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2774         {
2775             comm->bVacDLBNoLimit = FALSE;
2776         }
2777     }
2778
2779     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2780     if (!comm->bVacDLBNoLimit)
2781     {
2782         comm->cellsize_limit = std::max(comm->cellsize_limit,
2783                                         comm->cutoff/comm->maxpulse);
2784     }
2785     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2786     /* Set the minimum cell size for each DD dimension */
2787     for (d = 0; d < dd->ndim; d++)
2788     {
2789         if (comm->bVacDLBNoLimit ||
2790             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2791         {
2792             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2793         }
2794         else
2795         {
2796             comm->cellsize_min_dlb[dd->dim[d]] =
2797                 comm->cutoff/comm->cd[d].np_dlb;
2798         }
2799     }
2800     if (comm->cutoff_mbody <= 0)
2801     {
2802         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2803     }
2804     if (isDlbOn(comm))
2805     {
2806         set_dlb_limits(dd);
2807     }
2808 }
2809
2810 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2811 {
2812     /* If each molecule is a single charge group
2813      * or we use domain decomposition for each periodic dimension,
2814      * we do not need to take pbc into account for the bonded interactions.
2815      */
2816     return (ePBC != epbcNONE && dd->comm->haveInterDomainBondeds &&
2817             !(dd->nc[XX] > 1 &&
2818               dd->nc[YY] > 1 &&
2819               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2820 }
2821
2822 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2823 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2824                                   gmx_domdec_t *dd, real dlb_scale,
2825                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2826                                   const gmx_ddbox_t *ddbox)
2827 {
2828     gmx_domdec_comm_t *comm;
2829     int                natoms_tot;
2830     real               vol_frac;
2831
2832     comm = dd->comm;
2833
2834     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2835     {
2836         init_ddpme(dd, &comm->ddpme[0], 0);
2837         if (comm->npmedecompdim >= 2)
2838         {
2839             init_ddpme(dd, &comm->ddpme[1], 1);
2840         }
2841     }
2842     else
2843     {
2844         comm->npmenodes = 0;
2845         if (dd->pme_nodeid >= 0)
2846         {
2847             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2848                                  "Can not have separate PME ranks without PME electrostatics");
2849         }
2850     }
2851
2852     if (debug)
2853     {
2854         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2855     }
2856     if (!isDlbDisabled(comm))
2857     {
2858         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2859     }
2860
2861     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2862
2863     if (ir->ePBC == epbcNONE)
2864     {
2865         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2866     }
2867     else
2868     {
2869         vol_frac =
2870             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2871     }
2872     if (debug)
2873     {
2874         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2875     }
2876     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2877
2878     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2879                                  static_cast<int>(vol_frac*natoms_tot));
2880 }
2881
2882 /*! \brief Set some important DD parameters that can be modified by env.vars */
2883 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2884                                   gmx_domdec_t *dd, int rank_mysim)
2885 {
2886     gmx_domdec_comm_t *comm = dd->comm;
2887
2888     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2889     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2890     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2891     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2892     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2893     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2894     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2895
2896     if (dd->bSendRecv2)
2897     {
2898         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2899     }
2900
2901     if (comm->eFlop)
2902     {
2903         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2904         if (comm->eFlop > 1)
2905         {
2906             srand(1 + rank_mysim);
2907         }
2908         comm->bRecordLoad = TRUE;
2909     }
2910     else
2911     {
2912         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2913     }
2914 }
2915
2916 gmx_domdec_t::gmx_domdec_t(const t_inputrec &ir) :
2917     unitCellInfo(ir)
2918 {
2919 }
2920
2921 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2922                                         t_commrec                     *cr,
2923                                         const DomdecOptions           &options,
2924                                         const gmx::MdrunOptions       &mdrunOptions,
2925                                         const gmx_mtop_t              *mtop,
2926                                         const t_inputrec              *ir,
2927                                         const matrix                   box,
2928                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2929                                         gmx::LocalAtomSetManager      *atomSets)
2930 {
2931     gmx_domdec_t      *dd;
2932
2933     GMX_LOG(mdlog.info).appendTextFormatted(
2934             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2935
2936     dd = new gmx_domdec_t(*ir);
2937
2938     dd->comm = init_dd_comm();
2939
2940     /* Initialize DD paritioning counters */
2941     dd->comm->partition_step = INT_MIN;
2942     dd->ddp_count            = 0;
2943
2944     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2945
2946     gmx_ddbox_t ddbox = {0};
2947     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2948                            mtop, ir,
2949                            box, xGlobal,
2950                            &ddbox);
2951
2952     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2953
2954     if (thisRankHasDuty(cr, DUTY_PP))
2955     {
2956         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2957
2958         setup_neighbor_relations(dd);
2959     }
2960
2961     /* Set overallocation to avoid frequent reallocation of arrays */
2962     set_over_alloc_dd(TRUE);
2963
2964     clear_dd_cycle_counts(dd);
2965
2966     dd->atomSets = atomSets;
2967
2968     return dd;
2969 }
2970
2971 static gmx_bool test_dd_cutoff(t_commrec     *cr,
2972                                const t_state &state,
2973                                real           cutoffRequested)
2974 {
2975     gmx_domdec_t *dd;
2976     gmx_ddbox_t   ddbox;
2977     int           d, dim, np;
2978     real          inv_cell_size;
2979     int           LocallyLimited;
2980
2981     dd = cr->dd;
2982
2983     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
2984
2985     LocallyLimited = 0;
2986
2987     for (d = 0; d < dd->ndim; d++)
2988     {
2989         dim = dd->dim[d];
2990
2991         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
2992         if (dd->unitCellInfo.ddBoxIsDynamic)
2993         {
2994             inv_cell_size *= DD_PRES_SCALE_MARGIN;
2995         }
2996
2997         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
2998
2999         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3000         {
3001             if (np > dd->comm->cd[d].np_dlb)
3002             {
3003                 return FALSE;
3004             }
3005
3006             /* If a current local cell size is smaller than the requested
3007              * cut-off, we could still fix it, but this gets very complicated.
3008              * Without fixing here, we might actually need more checks.
3009              */
3010             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3011             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3012             {
3013                 LocallyLimited = 1;
3014             }
3015         }
3016     }
3017
3018     if (!isDlbDisabled(dd->comm))
3019     {
3020         /* If DLB is not active yet, we don't need to check the grid jumps.
3021          * Actually we shouldn't, because then the grid jump data is not set.
3022          */
3023         if (isDlbOn(dd->comm) &&
3024             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3025         {
3026             LocallyLimited = 1;
3027         }
3028
3029         gmx_sumi(1, &LocallyLimited, cr);
3030
3031         if (LocallyLimited > 0)
3032         {
3033             return FALSE;
3034         }
3035     }
3036
3037     return TRUE;
3038 }
3039
3040 gmx_bool change_dd_cutoff(t_commrec     *cr,
3041                           const t_state &state,
3042                           real           cutoffRequested)
3043 {
3044     gmx_bool bCutoffAllowed;
3045
3046     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3047
3048     if (bCutoffAllowed)
3049     {
3050         cr->dd->comm->cutoff = cutoffRequested;
3051     }
3052
3053     return bCutoffAllowed;
3054 }