src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "partition.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/domdec/collect.h"
  56 #include "gromacs/domdec/dlb.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/ga2la.h"
  61 #include "gromacs/domdec/localatomsetmanager.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/ewald/pme.h"
  64 #include "gromacs/gmxlib/chargegroup.h"
  65 #include "gromacs/gmxlib/network.h"
  66 #include "gromacs/gmxlib/nrnb.h"
  67 #include "gromacs/imd/imd.h"
  68 #include "gromacs/math/functions.h"
  69 #include "gromacs/math/vec.h"
  70 #include "gromacs/mdlib/forcerec.h"
  71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  72 #include "gromacs/mdlib/mdatoms.h"
  73 #include "gromacs/mdlib/nsgrid.h"
  74 #include "gromacs/mdlib/vsite.h"
  75 #include "gromacs/mdtypes/commrec.h"
  76 #include "gromacs/mdtypes/forcerec.h"
  77 #include "gromacs/mdtypes/inputrec.h"
  78 #include "gromacs/mdtypes/md_enums.h"
  79 #include "gromacs/mdtypes/nblist.h"
  80 #include "gromacs/mdtypes/state.h"
  81 #include "gromacs/nbnxm/nbnxm.h"
  82 #include "gromacs/pulling/pull.h"
  83 #include "gromacs/timing/wallcycle.h"
  84 #include "gromacs/topology/mtop_util.h"
  85 #include "gromacs/topology/topology.h"
  86 #include "gromacs/utility/cstringutil.h"
  87 #include "gromacs/utility/fatalerror.h"
  88 #include "gromacs/utility/logger.h"
  89 #include "gromacs/utility/real.h"
  90 #include "gromacs/utility/smalloc.h"
  91 #include "gromacs/utility/strconvert.h"
  92 #include "gromacs/utility/stringstream.h"
  93 #include "gromacs/utility/stringutil.h"
  94 #include "gromacs/utility/textwriter.h"
  95
  96 #include "box.h"
  97 #include "cellsizes.h"
  98 #include "distribute.h"
  99 #include "domdec_constraints.h"
 100 #include "domdec_internal.h"
 101 #include "domdec_vsite.h"
 102 #include "dump.h"
 103 #include "redistribute.h"
 104 #include "utility.h"
 105
 106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 107  *
 108  * There is a bit of overhead with DLB and it's difficult to achieve
 109  * a load imbalance of less than 2% with DLB.
 110  */
 111 #define DD_PERF_LOSS_DLB_ON  0.02
 112
 113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 114 #define DD_PERF_LOSS_WARN    0.05
 115
 116
 117 //! Debug helper printing a DD zone
 118 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 119 {
 120     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 121             d, i, j,
 122             zone->min0, zone->max1,
 123             zone->mch0, zone->mch0,
 124             zone->p1_0, zone->p1_1);
 125 }
 126
 127 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 128  * takes the extremes over all home and remote zones in the halo
 129  * and returns the results in cell_ns_x0 and cell_ns_x1.
 130  * Note: only used with the group cut-off scheme.
 131  */
 132 static void dd_move_cellx(gmx_domdec_t      *dd,
 133                           const gmx_ddbox_t *ddbox,
 134                           rvec               cell_ns_x0,
 135                           rvec               cell_ns_x1)
 136 {
 137     constexpr int      c_ddZoneCommMaxNumZones = 5;
 138     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 139     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 140     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 141     gmx_domdec_comm_t *comm = dd->comm;
 142
 143     rvec               extr_s[2];
 144     rvec               extr_r[2];
 145     for (int d = 1; d < dd->ndim; d++)
 146     {
 147         int           dim = dd->dim[d];
 148         gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 149
 150         /* Copy the base sizes of the home zone */
 151         zp.min0    = cell_ns_x0[dim];
 152         zp.max1    = cell_ns_x1[dim];
 153         zp.min1    = cell_ns_x1[dim];
 154         zp.mch0    = cell_ns_x0[dim];
 155         zp.mch1    = cell_ns_x1[dim];
 156         zp.p1_0    = cell_ns_x0[dim];
 157         zp.p1_1    = cell_ns_x1[dim];
 158         zp.dataSet = 1;
 159     }
 160
 161     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 162
 163     /* Loop backward over the dimensions and aggregate the extremes
 164      * of the cell sizes.
 165      */
 166     for (int d = dd->ndim - 2; d >= 0; d--)
 167     {
 168         const int  dim      = dd->dim[d];
 169         const bool applyPbc = (dim < ddbox->npbcdim);
 170
 171         /* Use an rvec to store two reals */
 172         extr_s[d][0] = cellsizes[d + 1].fracLower;
 173         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 174         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 175
 176         int pos = 0;
 177         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 178         /* Store the extremes in the backward sending buffer,
 179          * so they get updated separately from the forward communication.
 180          */
 181         for (int d1 = d; d1 < dd->ndim-1; d1++)
 182         {
 183             gmx_ddzone_t &buf = buf_s[pos];
 184
 185             /* We invert the order to be able to use the same loop for buf_e */
 186             buf.min0    = extr_s[d1][1];
 187             buf.max1    = extr_s[d1][0];
 188             buf.min1    = extr_s[d1][2];
 189             buf.mch0    = 0;
 190             buf.mch1    = 0;
 191             /* Store the cell corner of the dimension we communicate along */
 192             buf.p1_0    = comm->cell_x0[dim];
 193             buf.p1_1    = 0;
 194             buf.dataSet = 1;
 195             pos++;
 196         }
 197
 198         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 199         pos++;
 200
 201         if (dd->ndim == 3 && d == 0)
 202         {
 203             buf_s[pos] = comm->zone_d2[0][1];
 204             pos++;
 205             buf_s[pos] = comm->zone_d1[0];
 206             pos++;
 207         }
 208
 209         /* We only need to communicate the extremes
 210          * in the forward direction
 211          */
 212         int numPulses = comm->cd[d].numPulses();
 213         int numPulsesMin;
 214         if (applyPbc)
 215         {
 216             /* Take the minimum to avoid double communication */
 217             numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
 218         }
 219         else
 220         {
 221             /* Without PBC we should really not communicate over
 222              * the boundaries, but implementing that complicates
 223              * the communication setup and therefore we simply
 224              * do all communication, but ignore some data.
 225              */
 226             numPulsesMin = numPulses;
 227         }
 228         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 229         {
 230             /* Communicate the extremes forward */
 231             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 232
 233             int  numElements      = dd->ndim - d - 1;
 234             ddSendrecv(dd, d, dddirForward,
 235                        extr_s + d, numElements,
 236                        extr_r + d, numElements);
 237
 238             if (receiveValidData)
 239             {
 240                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 241                 {
 242                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 243                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 244                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 245                 }
 246             }
 247         }
 248
 249         const int numElementsInBuffer = pos;
 250         for (int pulse = 0; pulse < numPulses; pulse++)
 251         {
 252             /* Communicate all the zone information backward */
 253             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
 254
 255             static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 256
 257             int numReals = numElementsInBuffer*c_ddzoneNumReals;
 258             ddSendrecv(dd, d, dddirBackward,
 259                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 260                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 261
 262             rvec dh = { 0 };
 263             if (pulse > 0)
 264             {
 265                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 266                 {
 267                     /* Determine the decrease of maximum required
 268                      * communication height along d1 due to the distance along d,
 269                      * this avoids a lot of useless atom communication.
 270                      */
 271                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 272
 273                     real c;
 274                     if (ddbox->tric_dir[dim])
 275                     {
 276                         /* c is the off-diagonal coupling between the cell planes
 277                          * along directions d and d1.
 278                          */
 279                         c = ddbox->v[dim][dd->dim[d1]][dim];
 280                     }
 281                     else
 282                     {
 283                         c = 0;
 284                     }
 285                     real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 286                     if (det > 0)
 287                     {
 288                         dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
 289                     }
 290                     else
 291                     {
 292                         /* A negative value signals out of range */
 293                         dh[d1] = -1;
 294                     }
 295                 }
 296             }
 297
 298             /* Accumulate the extremes over all pulses */
 299             for (int i = 0; i < numElementsInBuffer; i++)
 300             {
 301                 if (pulse == 0)
 302                 {
 303                     buf_e[i] = buf_r[i];
 304                 }
 305                 else
 306                 {
 307                     if (receiveValidData)
 308                     {
 309                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 310                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 311                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 312                     }
 313
 314                     int d1;
 315                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 316                     {
 317                         d1 = 1;
 318                     }
 319                     else
 320                     {
 321                         d1 = d + 1;
 322                     }
 323                     if (receiveValidData && dh[d1] >= 0)
 324                     {
 325                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 326                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 327                     }
 328                 }
 329                 /* Copy the received buffer to the send buffer,
 330                  * to pass the data through with the next pulse.
 331                  */
 332                 buf_s[i] = buf_r[i];
 333             }
 334             if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
 335                 (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
 336             {
 337                 /* Store the extremes */
 338                 int pos = 0;
 339
 340                 for (int d1 = d; d1 < dd->ndim-1; d1++)
 341                 {
 342                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 343                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 344                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 345                     pos++;
 346                 }
 347
 348                 if (d == 1 || (d == 0 && dd->ndim == 3))
 349                 {
 350                     for (int i = d; i < 2; i++)
 351                     {
 352                         comm->zone_d2[1-d][i] = buf_e[pos];
 353                         pos++;
 354                     }
 355                 }
 356                 if (d == 0)
 357                 {
 358                     comm->zone_d1[1] = buf_e[pos];
 359                     pos++;
 360                 }
 361             }
 362             else
 363             {
 364                 if (d == 1 || (d == 0 && dd->ndim == 3))
 365                 {
 366                     for (int i = d; i < 2; i++)
 367                     {
 368                         comm->zone_d2[1 - d][i].dataSet = 0;
 369                     }
 370                 }
 371                 if (d == 0)
 372                 {
 373                     comm->zone_d1[1].dataSet = 0;
 374                 }
 375             }
 376         }
 377     }
 378
 379     if (dd->ndim >= 2)
 380     {
 381         int dim = dd->dim[1];
 382         for (int i = 0; i < 2; i++)
 383         {
 384             if (comm->zone_d1[i].dataSet != 0)
 385             {
 386                 if (debug)
 387                 {
 388                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 389                 }
 390                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 391                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 392             }
 393         }
 394     }
 395     if (dd->ndim >= 3)
 396     {
 397         int dim = dd->dim[2];
 398         for (int i = 0; i < 2; i++)
 399         {
 400             for (int j = 0; j < 2; j++)
 401             {
 402                 if (comm->zone_d2[i][j].dataSet != 0)
 403                 {
 404                     if (debug)
 405                     {
 406                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 407                     }
 408                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 409                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 410                 }
 411             }
 412         }
 413     }
 414     for (int d = 1; d < dd->ndim; d++)
 415     {
 416         cellsizes[d].fracLowerMax = extr_s[d-1][0];
 417         cellsizes[d].fracUpperMin = extr_s[d-1][1];
 418         if (debug)
 419         {
 420             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 421                     d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
 422         }
 423     }
 424 }
 425
 426 //! Sets the charge-group zones to be equal to the home zone.
 427 static void set_zones_ncg_home(gmx_domdec_t *dd)
 428 {
 429     gmx_domdec_zones_t *zones;
 430     int                 i;
 431
 432     zones = &dd->comm->zones;
 433
 434     zones->cg_range[0] = 0;
 435     for (i = 1; i < zones->n+1; i++)
 436     {
 437         zones->cg_range[i] = dd->ncg_home;
 438     }
 439     /* zone_ncg1[0] should always be equal to ncg_home */
 440     dd->comm->zone_ncg1[0] = dd->ncg_home;
 441 }
 442
 443 //! Restore atom groups for the charge groups.
 444 static void restoreAtomGroups(gmx_domdec_t  *dd,
 445                               const t_state *state)
 446 {
 447     gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;
 448
 449     std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
 450
 451     globalAtomGroupIndices.resize(atomGroupsState.size());
 452
 453     /* Copy back the global charge group indices from state
 454      * and rebuild the local charge group to atom index.
 455      */
 456     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 457     {
 458         globalAtomGroupIndices[i] = atomGroupsState[i];
 459     }
 460
 461     dd->ncg_home = atomGroupsState.size();
 462     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 463
 464     set_zones_ncg_home(dd);
 465 }
 466
 467 //! Sets the cginfo structures.
 468 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
 469                           t_forcerec *fr, char *bLocalCG)
 470 {
 471     if (fr != nullptr)
 472     {
 473         const cginfo_mb_t *cginfo_mb = fr->cginfo_mb;
 474         gmx::ArrayRef<int> cginfo    = fr->cginfo;
 475
 476         for (int cg = cg0; cg < cg1; cg++)
 477         {
 478             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 479         }
 480     }
 481
 482     if (bLocalCG != nullptr)
 483     {
 484         for (int cg = cg0; cg < cg1; cg++)
 485         {
 486             bLocalCG[index_gl[cg]] = TRUE;
 487         }
 488     }
 489 }
 490
 491 //! Makes the mappings between global and local atom indices during DD repartioning.
 492 static void make_dd_indices(gmx_domdec_t *dd,
 493                             const int     atomStart)
 494 {
 495     const int                 numZones               = dd->comm->zones.n;
 496     const int                *zone2cg                = dd->comm->zones.cg_range;
 497     const int                *zone_ncg1              = dd->comm->zone_ncg1;
 498     gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
 499
 500     std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
 501     gmx_ga2la_t              &ga2la                  = *dd->ga2la;
 502
 503     if (zone2cg[1] != dd->ncg_home)
 504     {
 505         gmx_incons("dd->ncg_zone is not up to date");
 506     }
 507
 508     /* Make the local to global and global to local atom index */
 509     int a = atomStart;
 510     globalAtomIndices.resize(a);
 511     for (int zone = 0; zone < numZones; zone++)
 512     {
 513         int cg0;
 514         if (zone == 0)
 515         {
 516             cg0 = atomStart;
 517         }
 518         else
 519         {
 520             cg0 = zone2cg[zone];
 521         }
 522         int cg1    = zone2cg[zone+1];
 523         int cg1_p1 = cg0 + zone_ncg1[zone];
 524
 525         for (int cg = cg0; cg < cg1; cg++)
 526         {
 527             int zone1 = zone;
 528             if (cg >= cg1_p1)
 529             {
 530                 /* Signal that this cg is from more than one pulse away */
 531                 zone1 += numZones;
 532             }
 533             int cg_gl = globalAtomGroupIndices[cg];
 534             globalAtomIndices.push_back(cg_gl);
 535             ga2la.insert(cg_gl, {a, zone1});
 536             a++;
 537         }
 538     }
 539 }
 540
 541 //! Checks the charge-group assignements.
 542 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 543                           const char *where)
 544 {
 545     int nerr = 0;
 546     if (bLocalCG == nullptr)
 547     {
 548         return nerr;
 549     }
 550     for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
 551     {
 552         if (!bLocalCG[dd->globalAtomGroupIndices[i]])
 553         {
 554             fprintf(stderr,
 555                     "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
 556             nerr++;
 557         }
 558     }
 559     size_t ngl = 0;
 560     for (int i = 0; i < ncg_sys; i++)
 561     {
 562         if (bLocalCG[i])
 563         {
 564             ngl++;
 565         }
 566     }
 567     if (ngl != dd->globalAtomGroupIndices.size())
 568     {
 569         fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
 570         nerr++;
 571     }
 572
 573     return nerr;
 574 }
 575
 576 //! Checks whether global and local atom indices are consistent.
 577 static void check_index_consistency(gmx_domdec_t *dd,
 578                                     int natoms_sys, int ncg_sys,
 579                                     const char *where)
 580 {
 581     int       nerr = 0;
 582
 583     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 584
 585     if (dd->comm->DD_debug > 1)
 586     {
 587         std::vector<int> have(natoms_sys);
 588         for (int a = 0; a < numAtomsInZones; a++)
 589         {
 590             int globalAtomIndex = dd->globalAtomIndices[a];
 591             if (have[globalAtomIndex] > 0)
 592             {
 593                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
 594             }
 595             else
 596             {
 597                 have[globalAtomIndex] = a + 1;
 598             }
 599         }
 600     }
 601
 602     std::vector<int> have(numAtomsInZones);
 603
 604     int              ngl = 0;
 605     for (int i = 0; i < natoms_sys; i++)
 606     {
 607         if (const auto entry = dd->ga2la->find(i))
 608         {
 609             const int a = entry->la;
 610             if (a >= numAtomsInZones)
 611             {
 612                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
 613                 nerr++;
 614             }
 615             else
 616             {
 617                 have[a] = 1;
 618                 if (dd->globalAtomIndices[a] != i)
 619                 {
 620                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
 621                     nerr++;
 622                 }
 623             }
 624             ngl++;
 625         }
 626     }
 627     if (ngl != numAtomsInZones)
 628     {
 629         fprintf(stderr,
 630                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
 631                 dd->rank, where, ngl, numAtomsInZones);
 632     }
 633     for (int a = 0; a < numAtomsInZones; a++)
 634     {
 635         if (have[a] == 0)
 636         {
 637             fprintf(stderr,
 638                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
 639                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
 640         }
 641     }
 642
 643     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 644
 645     if (nerr > 0)
 646     {
 647         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
 648                   dd->rank, where, nerr);
 649     }
 650 }
 651
 652 //! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
 653 static void clearDDStateIndices(gmx_domdec_t *dd,
 654                                 int           atomGroupStart,
 655                                 int           atomStart)
 656 {
 657     gmx_ga2la_t &ga2la = *dd->ga2la;
 658
 659     if (atomStart == 0)
 660     {
 661         /* Clear the whole list without the overhead of searching */
 662         ga2la.clear();
 663     }
 664     else
 665     {
 666         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 667         for (int i = 0; i < numAtomsInZones; i++)
 668         {
 669             ga2la.erase(dd->globalAtomIndices[i]);
 670         }
 671     }
 672
 673     char *bLocalCG = dd->comm->bLocalCG;
 674     if (bLocalCG)
 675     {
 676         for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
 677         {
 678             bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
 679         }
 680     }
 681
 682     dd_clear_local_vsite_indices(dd);
 683
 684     if (dd->constraints)
 685     {
 686         dd_clear_local_constraint_indices(dd);
 687     }
 688 }
 689
 690 bool check_grid_jump(int64_t             step,
 691                      const gmx_domdec_t *dd,
 692                      real                cutoff,
 693                      const gmx_ddbox_t  *ddbox,
 694                      gmx_bool            bFatal)
 695 {
 696     gmx_domdec_comm_t *comm    = dd->comm;
 697     bool               invalid = false;
 698
 699     for (int d = 1; d < dd->ndim; d++)
 700     {
 701         const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
 702         const int                 dim       = dd->dim[d];
 703         const real                limit     = grid_jump_limit(comm, cutoff, d);
 704         real                      bfac      = ddbox->box_size[dim];
 705         if (ddbox->tric_dir[dim])
 706         {
 707             bfac *= ddbox->skew_fac[dim];
 708         }
 709         if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
 710             (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
 711         {
 712             invalid = true;
 713
 714             if (bFatal)
 715             {
 716                 char buf[22];
 717
 718                 /* This error should never be triggered under normal
 719                  * circumstances, but you never know ...
 720                  */
 721                 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
 722                           gmx_step_str(step, buf),
 723                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 724             }
 725         }
 726     }
 727
 728     return invalid;
 729 }
 730 //! Return the duration of force calculations on this rank.
 731 static float dd_force_load(gmx_domdec_comm_t *comm)
 732 {
 733     float load;
 734
 735     if (comm->eFlop)
 736     {
 737         load = comm->flop;
 738         if (comm->eFlop > 1)
 739         {
 740             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 741         }
 742     }
 743     else
 744     {
 745         load = comm->cycl[ddCyclF];
 746         if (comm->cycl_n[ddCyclF] > 1)
 747         {
 748             /* Subtract the maximum of the last n cycle counts
 749              * to get rid of possible high counts due to other sources,
 750              * for instance system activity, that would otherwise
 751              * affect the dynamic load balancing.
 752              */
 753             load -= comm->cycl_max[ddCyclF];
 754         }
 755
 756 #if GMX_MPI
 757         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 758         {
 759             float gpu_wait, gpu_wait_sum;
 760
 761             gpu_wait = comm->cycl[ddCyclWaitGPU];
 762             if (comm->cycl_n[ddCyclF] > 1)
 763             {
 764                 /* We should remove the WaitGPU time of the same MD step
 765                  * as the one with the maximum F time, since the F time
 766                  * and the wait time are not independent.
 767                  * Furthermore, the step for the max F time should be chosen
 768                  * the same on all ranks that share the same GPU.
 769                  * But to keep the code simple, we remove the average instead.
 770                  * The main reason for artificially long times at some steps
 771                  * is spurious CPU activity or MPI time, so we don't expect
 772                  * that changes in the GPU wait time matter a lot here.
 773                  */
 774                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
 775             }
 776             /* Sum the wait times over the ranks that share the same GPU */
 777             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
 778                           comm->mpi_comm_gpu_shared);
 779             /* Replace the wait time by the average over the ranks */
 780             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
 781         }
 782 #endif
 783     }
 784
 785     return load;
 786 }
 787
 788 //! Runs cell size checks and communicates the boundaries.
 789 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 790                                   gmx_ddbox_t *ddbox,
 791                                   rvec cell_ns_x0, rvec cell_ns_x1,
 792                                   int64_t step)
 793 {
 794     gmx_domdec_comm_t *comm;
 795     int                dim_ind, dim;
 796
 797     comm = dd->comm;
 798
 799     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 800     {
 801         dim = dd->dim[dim_ind];
 802
 803         /* Without PBC we don't have restrictions on the outer cells */
 804         if (!(dim >= ddbox->npbcdim &&
 805               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 806             isDlbOn(comm) &&
 807             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 808             comm->cellsize_min[dim])
 809         {
 810             char buf[22];
 811             gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 812                       gmx_step_str(step, buf), dim2char(dim),
 813                       comm->cell_x1[dim] - comm->cell_x0[dim],
 814                       ddbox->skew_fac[dim],
 815                       dd->comm->cellsize_min[dim],
 816                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 817         }
 818     }
 819
 820     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 821     {
 822         /* Communicate the boundaries and update cell_ns_x0/1 */
 823         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 824         if (isDlbOn(dd->comm) && dd->ndim > 1)
 825         {
 826             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 827         }
 828     }
 829 }
 830
 831 //! Compute and communicate to determine the load distribution across PP ranks.
 832 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 833 {
 834     gmx_domdec_comm_t *comm;
 835     domdec_load_t     *load;
 836     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 837     gmx_bool           bSepPME;
 838
 839     if (debug)
 840     {
 841         fprintf(debug, "get_load_distribution start\n");
 842     }
 843
 844     wallcycle_start(wcycle, ewcDDCOMMLOAD);
 845
 846     comm = dd->comm;
 847
 848     bSepPME = (dd->pme_nodeid >= 0);
 849
 850     if (dd->ndim == 0 && bSepPME)
 851     {
 852         /* Without decomposition, but with PME nodes, we need the load */
 853         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 854         comm->load[0].pme = comm->cycl[ddCyclPME];
 855     }
 856
 857     for (int d = dd->ndim - 1; d >= 0; d--)
 858     {
 859         const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
 860         const int                 dim       = dd->dim[d];
 861         /* Check if we participate in the communication in this dimension */
 862         if (d == dd->ndim-1 ||
 863             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 864         {
 865             load = &comm->load[d];
 866             if (isDlbOn(dd->comm))
 867             {
 868                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
 869             }
 870             int pos = 0;
 871             if (d == dd->ndim-1)
 872             {
 873                 sbuf[pos++] = dd_force_load(comm);
 874                 sbuf[pos++] = sbuf[0];
 875                 if (isDlbOn(dd->comm))
 876                 {
 877                     sbuf[pos++] = sbuf[0];
 878                     sbuf[pos++] = cell_frac;
 879                     if (d > 0)
 880                     {
 881                         sbuf[pos++] = cellsizes->fracLowerMax;
 882                         sbuf[pos++] = cellsizes->fracUpperMin;
 883                     }
 884                 }
 885                 if (bSepPME)
 886                 {
 887                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 888                     sbuf[pos++] = comm->cycl[ddCyclPME];
 889                 }
 890             }
 891             else
 892             {
 893                 sbuf[pos++] = comm->load[d+1].sum;
 894                 sbuf[pos++] = comm->load[d+1].max;
 895                 if (isDlbOn(dd->comm))
 896                 {
 897                     sbuf[pos++] = comm->load[d+1].sum_m;
 898                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 899                     sbuf[pos++] = comm->load[d+1].flags;
 900                     if (d > 0)
 901                     {
 902                         sbuf[pos++] = cellsizes->fracLowerMax;
 903                         sbuf[pos++] = cellsizes->fracUpperMin;
 904                     }
 905                 }
 906                 if (bSepPME)
 907                 {
 908                     sbuf[pos++] = comm->load[d+1].mdf;
 909                     sbuf[pos++] = comm->load[d+1].pme;
 910                 }
 911             }
 912             load->nload = pos;
 913             /* Communicate a row in DD direction d.
 914              * The communicators are setup such that the root always has rank 0.
 915              */
 916 #if GMX_MPI
 917             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 918                        load->load, load->nload*sizeof(float), MPI_BYTE,
 919                        0, comm->mpi_comm_load[d]);
 920 #endif
 921             if (dd->ci[dim] == dd->master_ci[dim])
 922             {
 923                 /* We are the master along this row, process this row */
 924                 RowMaster *rowMaster = nullptr;
 925
 926                 if (isDlbOn(comm))
 927                 {
 928                     rowMaster = cellsizes->rowMaster.get();
 929                 }
 930                 load->sum      = 0;
 931                 load->max      = 0;
 932                 load->sum_m    = 0;
 933                 load->cvol_min = 1;
 934                 load->flags    = 0;
 935                 load->mdf      = 0;
 936                 load->pme      = 0;
 937                 int pos        = 0;
 938                 for (int i = 0; i < dd->nc[dim]; i++)
 939                 {
 940                     load->sum += load->load[pos++];
 941                     load->max  = std::max(load->max, load->load[pos]);
 942                     pos++;
 943                     if (isDlbOn(dd->comm))
 944                     {
 945                         if (rowMaster->dlbIsLimited)
 946                         {
 947                             /* This direction could not be load balanced properly,
 948                              * therefore we need to use the maximum iso the average load.
 949                              */
 950                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 951                         }
 952                         else
 953                         {
 954                             load->sum_m += load->load[pos];
 955                         }
 956                         pos++;
 957                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 958                         pos++;
 959                         if (d < dd->ndim-1)
 960                         {
 961                             load->flags = gmx::roundToInt(load->load[pos++]);
 962                         }
 963                         if (d > 0)
 964                         {
 965                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 966                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 967                         }
 968                     }
 969                     if (bSepPME)
 970                     {
 971                         load->mdf = std::max(load->mdf, load->load[pos]);
 972                         pos++;
 973                         load->pme = std::max(load->pme, load->load[pos]);
 974                         pos++;
 975                     }
 976                 }
 977                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 978                 {
 979                     load->sum_m *= dd->nc[dim];
 980                     load->flags |= (1<<d);
 981                 }
 982             }
 983         }
 984     }
 985
 986     if (DDMASTER(dd))
 987     {
 988         comm->nload      += dd_load_count(comm);
 989         comm->load_step  += comm->cycl[ddCyclStep];
 990         comm->load_sum   += comm->load[0].sum;
 991         comm->load_max   += comm->load[0].max;
 992         if (isDlbOn(comm))
 993         {
 994             for (int d = 0; d < dd->ndim; d++)
 995             {
 996                 if (comm->load[0].flags & (1<<d))
 997                 {
 998                     comm->load_lim[d]++;
 999                 }
1000             }
1001         }
1002         if (bSepPME)
1003         {
1004             comm->load_mdf += comm->load[0].mdf;
1005             comm->load_pme += comm->load[0].pme;
1006         }
1007     }
1008
1009     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
1010
1011     if (debug)
1012     {
1013         fprintf(debug, "get_load_distribution finished\n");
1014     }
1015 }
1016
1017 /*! \brief Return the relative performance loss on the total run time
1018  * due to the force calculation load imbalance. */
1019 static float dd_force_load_fraction(gmx_domdec_t *dd)
1020 {
1021     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1022     {
1023         return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
1024     }
1025     else
1026     {
1027         return 0;
1028     }
1029 }
1030
1031 /*! \brief Return the relative performance loss on the total run time
1032  * due to the force calculation load imbalance. */
1033 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
1034 {
1035     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1036     {
1037         return
1038             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
1039             (dd->comm->load_step*dd->nnodes);
1040     }
1041     else
1042     {
1043         return 0;
1044     }
1045 }
1046
1047 //! Print load-balance report e.g. at the end of a run.
1048 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
1049 {
1050     gmx_domdec_comm_t *comm = dd->comm;
1051
1052     /* Only the master rank prints loads and only if we measured loads */
1053     if (!DDMASTER(dd) || comm->nload == 0)
1054     {
1055         return;
1056     }
1057
1058     char  buf[STRLEN];
1059     int   numPpRanks   = dd->nnodes;
1060     int   numPmeRanks  = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
1061     int   numRanks     = numPpRanks + numPmeRanks;
1062     float lossFraction = 0;
1063
1064     /* Print the average load imbalance and performance loss */
1065     if (dd->nnodes > 1 && comm->load_sum > 0)
1066     {
1067         float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
1068         lossFraction    = dd_force_imb_perf_loss(dd);
1069
1070         std::string msg         = "\nDynamic load balancing report:\n";
1071         std::string dlbStateStr;
1072
1073         switch (dd->comm->dlbState)
1074         {
1075             case DlbState::offUser:
1076                 dlbStateStr = "DLB was off during the run per user request.";
1077                 break;
1078             case DlbState::offForever:
1079                 /* Currectly this can happen due to performance loss observed, cell size
1080                  * limitations or incompatibility with other settings observed during
1081                  * determineInitialDlbState(). */
1082                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1083                 break;
1084             case DlbState::offCanTurnOn:
1085                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1086                 break;
1087             case DlbState::offTemporarilyLocked:
1088                 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1089                 break;
1090             case DlbState::onCanTurnOff:
1091                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1092                 break;
1093             case DlbState::onUser:
1094                 dlbStateStr = "DLB was permanently on during the run per user request.";
1095                 break;
1096             default:
1097                 GMX_ASSERT(false, "Undocumented DLB state");
1098         }
1099
1100         msg += " " + dlbStateStr + "\n";
1101         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
1102         msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1103                                  gmx::roundToInt(dd_force_load_fraction(dd)*100));
1104         msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1105                                  lossFraction*100);
1106         fprintf(fplog, "%s", msg.c_str());
1107         fprintf(stderr, "\n%s", msg.c_str());
1108     }
1109
1110     /* Print during what percentage of steps the  load balancing was limited */
1111     bool dlbWasLimited = false;
1112     if (isDlbOn(comm))
1113     {
1114         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1115         for (int d = 0; d < dd->ndim; d++)
1116         {
1117             int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
1118             sprintf(buf+strlen(buf), " %c %d %%",
1119                     dim2char(dd->dim[d]), limitPercentage);
1120             if (limitPercentage >= 50)
1121             {
1122                 dlbWasLimited = true;
1123             }
1124         }
1125         sprintf(buf + strlen(buf), "\n");
1126         fprintf(fplog, "%s", buf);
1127         fprintf(stderr, "%s", buf);
1128     }
1129
1130     /* Print the performance loss due to separate PME - PP rank imbalance */
1131     float lossFractionPme = 0;
1132     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1133     {
1134         float pmeForceRatio = comm->load_pme/comm->load_mdf;
1135         lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
1136         if (lossFractionPme <= 0)
1137         {
1138             lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
1139         }
1140         else
1141         {
1142             lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
1143         }
1144         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1145         fprintf(fplog, "%s", buf);
1146         fprintf(stderr, "%s", buf);
1147         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
1148         fprintf(fplog, "%s", buf);
1149         fprintf(stderr, "%s", buf);
1150     }
1151     fprintf(fplog, "\n");
1152     fprintf(stderr, "\n");
1153
1154     if (lossFraction >= DD_PERF_LOSS_WARN)
1155     {
1156         std::string message = gmx::formatString(
1157                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1158                     "      in the domain decomposition.\n", lossFraction*100);
1159
1160         bool hadSuggestion = false;
1161         if (!isDlbOn(comm))
1162         {
1163             message      += "      You might want to use dynamic load balancing (option -dlb.)\n";
1164             hadSuggestion = true;
1165         }
1166         else if (dlbWasLimited)
1167         {
1168             message      += "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1169             hadSuggestion = true;
1170         }
1171         message += gmx::formatString(
1172                     "      You can %sconsider manually changing the decomposition (option -dd);\n"
1173                     "      e.g. by using fewer domains along the box dimension in which there is\n"
1174                     "      considerable inhomogeneity in the simulated system.",
1175                     hadSuggestion ? "also " : "");
1176
1177
1178         fprintf(fplog, "%s\n", message.c_str());
1179         fprintf(stderr, "%s\n", message.c_str());
1180     }
1181     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1182     {
1183         sprintf(buf,
1184                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1185                 "      had %s work to do than the PP ranks.\n"
1186                 "      You might want to %s the number of PME ranks\n"
1187                 "      or %s the cut-off and the grid spacing.\n",
1188                 std::fabs(lossFractionPme*100),
1189                 (lossFractionPme < 0) ? "less"     : "more",
1190                 (lossFractionPme < 0) ? "decrease" : "increase",
1191                 (lossFractionPme < 0) ? "decrease" : "increase");
1192         fprintf(fplog, "%s\n", buf);
1193         fprintf(stderr, "%s\n", buf);
1194     }
1195 }
1196
1197 //! Return the minimum communication volume.
1198 static float dd_vol_min(gmx_domdec_t *dd)
1199 {
1200     return dd->comm->load[0].cvol_min*dd->nnodes;
1201 }
1202
1203 //! Return the DD load flags.
1204 static int dd_load_flags(gmx_domdec_t *dd)
1205 {
1206     return dd->comm->load[0].flags;
1207 }
1208
1209 //! Return the reported load imbalance in force calculations.
1210 static float dd_f_imbal(gmx_domdec_t *dd)
1211 {
1212     if (dd->comm->load[0].sum > 0)
1213     {
1214         return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0F;
1215     }
1216     else
1217     {
1218         /* Something is wrong in the cycle counting, report no load imbalance */
1219         return 0.0F;
1220     }
1221 }
1222
1223 //! Returns DD load balance report.
1224 static std::string
1225 dd_print_load(gmx_domdec_t *dd,
1226               int64_t       step)
1227 {
1228     gmx::StringOutputStream stream;
1229     gmx::TextWriter         log(&stream);
1230
1231     int                     flags = dd_load_flags(dd);
1232     if (flags)
1233     {
1234         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1235         for (int d = 0; d < dd->ndim; d++)
1236         {
1237             if (flags & (1<<d))
1238             {
1239                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1240             }
1241         }
1242         log.ensureLineBreak();
1243     }
1244     log.writeString("DD  step " + gmx::toString(step));
1245     if (isDlbOn(dd->comm))
1246     {
1247         log.writeStringFormatted("  vol min/aver %5.3f%c",
1248                                  dd_vol_min(dd), flags ? '!' : ' ');
1249     }
1250     if (dd->nnodes > 1)
1251     {
1252         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
1253     }
1254     if (dd->comm->cycl_n[ddCyclPME])
1255     {
1256         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1257     }
1258     log.ensureLineBreak();
1259     return stream.toString();
1260 }
1261
1262 //! Prints DD load balance report in mdrun verbose mode.
1263 static void dd_print_load_verbose(gmx_domdec_t *dd)
1264 {
1265     if (isDlbOn(dd->comm))
1266     {
1267         fprintf(stderr, "vol %4.2f%c ",
1268                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1269     }
1270     if (dd->nnodes > 1)
1271     {
1272         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
1273     }
1274     if (dd->comm->cycl_n[ddCyclPME])
1275     {
1276         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1277     }
1278 }
1279
1280 //! Turns on dynamic load balancing if possible and needed.
1281 static void turn_on_dlb(const gmx::MDLogger &mdlog,
1282                         gmx_domdec_t        *dd,
1283                         int64_t              step)
1284 {
1285     gmx_domdec_comm_t *comm         = dd->comm;
1286
1287     real               cellsize_min = comm->cellsize_min[dd->dim[0]];
1288     for (int d = 1; d < dd->ndim; d++)
1289     {
1290         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1291     }
1292
1293     /* Turn off DLB if we're too close to the cell size limit. */
1294     if (cellsize_min < comm->cellsize_limit*1.05)
1295     {
1296         GMX_LOG(mdlog.info).appendTextFormatted(
1297                 "step %s Measured %.1f %% performance loss due to load imbalance, "
1298                 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1299                 "Will no longer try dynamic load balancing.",
1300                 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1301
1302         comm->dlbState = DlbState::offForever;
1303         return;
1304     }
1305
1306     GMX_LOG(mdlog.info).appendTextFormatted(
1307             "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1308             gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1309     comm->dlbState = DlbState::onCanTurnOff;
1310
1311     /* Store the non-DLB performance, so we can check if DLB actually
1312      * improves performance.
1313      */
1314     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
1315     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
1316
1317     set_dlb_limits(dd);
1318
1319     /* We can set the required cell size info here,
1320      * so we do not need to communicate this.
1321      * The grid is completely uniform.
1322      */
1323     for (int d = 0; d < dd->ndim; d++)
1324     {
1325         RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1326
1327         if (rowMaster)
1328         {
1329             comm->load[d].sum_m = comm->load[d].sum;
1330
1331             int nc = dd->nc[dd->dim[d]];
1332             for (int i = 0; i < nc; i++)
1333             {
1334                 rowMaster->cellFrac[i] = i/static_cast<real>(nc);
1335                 if (d > 0)
1336                 {
1337                     rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
1338                     rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
1339                 }
1340             }
1341             rowMaster->cellFrac[nc] = 1.0;
1342         }
1343     }
1344 }
1345
1346 //! Turns off dynamic load balancing (but leave it able to turn back on).
1347 static void turn_off_dlb(const gmx::MDLogger &mdlog,
1348                          gmx_domdec_t        *dd,
1349                          int64_t              step)
1350 {
1351     GMX_LOG(mdlog.info).appendText(
1352             "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
1353     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1354     dd->comm->haveTurnedOffDlb             = true;
1355     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1356 }
1357
1358 //! Turns off dynamic load balancing permanently.
1359 static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
1360                                  gmx_domdec_t        *dd,
1361                                  int64_t              step)
1362 {
1363     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
1364     GMX_LOG(mdlog.info).appendText(
1365             "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
1366     dd->comm->dlbState = DlbState::offForever;
1367 }
1368
1369 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
1370 {
1371     gmx_domdec_comm_t *comm;
1372
1373     comm = cr->dd->comm;
1374
1375     /* Turn on the DLB limiting (might have been on already) */
1376     comm->bPMELoadBalDLBLimits = TRUE;
1377
1378     /* Change the cut-off limit */
1379     comm->PMELoadBal_max_cutoff = cutoff;
1380
1381     if (debug)
1382     {
1383         fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1384                 comm->PMELoadBal_max_cutoff);
1385     }
1386 }
1387
1388 //! Merge atom buffers.
1389 static void merge_cg_buffers(int ncell,
1390                              gmx_domdec_comm_dim_t *cd, int pulse,
1391                              int  *ncg_cell,
1392                              gmx::ArrayRef<int> index_gl,
1393                              const int  *recv_i,
1394                              gmx::ArrayRef<gmx::RVec> x,
1395                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1396                              cginfo_mb_t *cginfo_mb,
1397                              gmx::ArrayRef<int> cginfo)
1398 {
1399     gmx_domdec_ind_t *ind, *ind_p;
1400     int               p, cell, c, cg, cg0, cg1, cg_gl;
1401     int               shift;
1402
1403     ind = &cd->ind[pulse];
1404
1405     /* First correct the already stored data */
1406     shift = ind->nrecv[ncell];
1407     for (cell = ncell-1; cell >= 0; cell--)
1408     {
1409         shift -= ind->nrecv[cell];
1410         if (shift > 0)
1411         {
1412             /* Move the cg's present from previous grid pulses */
1413             cg0                = ncg_cell[ncell+cell];
1414             cg1                = ncg_cell[ncell+cell+1];
1415             for (cg = cg1-1; cg >= cg0; cg--)
1416             {
1417                 index_gl[cg + shift] = index_gl[cg];
1418                 x[cg + shift]        = x[cg];
1419                 cginfo[cg + shift]   = cginfo[cg];
1420             }
1421             /* Correct the already stored send indices for the shift */
1422             for (p = 1; p <= pulse; p++)
1423             {
1424                 ind_p = &cd->ind[p];
1425                 cg0   = 0;
1426                 for (c = 0; c < cell; c++)
1427                 {
1428                     cg0 += ind_p->nsend[c];
1429                 }
1430                 cg1 = cg0 + ind_p->nsend[cell];
1431                 for (cg = cg0; cg < cg1; cg++)
1432                 {
1433                     ind_p->index[cg] += shift;
1434                 }
1435             }
1436         }
1437     }
1438
1439     /* Merge in the communicated buffers */
1440     shift    = 0;
1441     cg0      = 0;
1442     for (cell = 0; cell < ncell; cell++)
1443     {
1444         cg1 = ncg_cell[ncell+cell+1] + shift;
1445         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1446         {
1447             /* Copy this atom from the buffer */
1448             index_gl[cg1] = recv_i[cg0];
1449             x[cg1]        = recv_vr[cg0];
1450             /* Copy information */
1451             cg_gl          = index_gl[cg1];
1452             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
1453             cg0++;
1454             cg1++;
1455         }
1456         shift                 += ind->nrecv[cell];
1457         ncg_cell[ncell+cell+1] = cg1;
1458     }
1459 }
1460
1461 //! Makes a range partitioning for the atom groups wthin a cell
1462 static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
1463                                int                           nzone,
1464                                int                           atomGroupStart)
1465 {
1466     /* Store the atom block boundaries for easy copying of communication buffers
1467      */
1468     int g = atomGroupStart;
1469     for (int zone = 0; zone < nzone; zone++)
1470     {
1471         for (gmx_domdec_ind_t &ind : cd->ind)
1472         {
1473             ind.cell2at0[zone]  = g;
1474             g                  += ind.nrecv[zone];
1475             ind.cell2at1[zone]  = g;
1476         }
1477     }
1478 }
1479
1480 //! Returns whether a link is missing.
1481 static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
1482 {
1483     int      i;
1484     gmx_bool bMiss;
1485
1486     bMiss = FALSE;
1487     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
1488     {
1489         if (!bLocalCG[link->a[i]])
1490         {
1491             bMiss = TRUE;
1492         }
1493     }
1494
1495     return bMiss;
1496 }
1497
1498 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1499 typedef struct {
1500     //! The corners for the non-bonded communication.
1501     real c[DIM][4];
1502     //! Corner for rounding.
1503     real cr0;
1504     //! Corners for rounding.
1505     real cr1[4];
1506     //! Corners for bounded communication.
1507     real bc[DIM];
1508     //! Corner for rounding for bonded communication.
1509     real bcr1;
1510 } dd_corners_t;
1511
1512 //! Determine the corners of the domain(s) we are communicating with.
1513 static void
1514 set_dd_corners(const gmx_domdec_t *dd,
1515                int dim0, int dim1, int dim2,
1516                gmx_bool bDistMB,
1517                dd_corners_t *c)
1518 {
1519     const gmx_domdec_comm_t  *comm;
1520     const gmx_domdec_zones_t *zones;
1521     int i, j;
1522
1523     comm = dd->comm;
1524
1525     zones = &comm->zones;
1526
1527     /* Keep the compiler happy */
1528     c->cr0  = 0;
1529     c->bcr1 = 0;
1530
1531     /* The first dimension is equal for all cells */
1532     c->c[0][0] = comm->cell_x0[dim0];
1533     if (bDistMB)
1534     {
1535         c->bc[0] = c->c[0][0];
1536     }
1537     if (dd->ndim >= 2)
1538     {
1539         dim1 = dd->dim[1];
1540         /* This cell row is only seen from the first row */
1541         c->c[1][0] = comm->cell_x0[dim1];
1542         /* All rows can see this row */
1543         c->c[1][1] = comm->cell_x0[dim1];
1544         if (isDlbOn(dd->comm))
1545         {
1546             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1547             if (bDistMB)
1548             {
1549                 /* For the multi-body distance we need the maximum */
1550                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1551             }
1552         }
1553         /* Set the upper-right corner for rounding */
1554         c->cr0 = comm->cell_x1[dim0];
1555
1556         if (dd->ndim >= 3)
1557         {
1558             dim2 = dd->dim[2];
1559             for (j = 0; j < 4; j++)
1560             {
1561                 c->c[2][j] = comm->cell_x0[dim2];
1562             }
1563             if (isDlbOn(dd->comm))
1564             {
1565                 /* Use the maximum of the i-cells that see a j-cell */
1566                 for (i = 0; i < zones->nizone; i++)
1567                 {
1568                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1569                     {
1570                         if (j >= 4)
1571                         {
1572                             c->c[2][j-4] =
1573                                 std::max(c->c[2][j-4],
1574                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1575                         }
1576                     }
1577                 }
1578                 if (bDistMB)
1579                 {
1580                     /* For the multi-body distance we need the maximum */
1581                     c->bc[2] = comm->cell_x0[dim2];
1582                     for (i = 0; i < 2; i++)
1583                     {
1584                         for (j = 0; j < 2; j++)
1585                         {
1586                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1587                         }
1588                     }
1589                 }
1590             }
1591
1592             /* Set the upper-right corner for rounding */
1593             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1594              * Only cell (0,0,0) can see cell 7 (1,1,1)
1595              */
1596             c->cr1[0] = comm->cell_x1[dim1];
1597             c->cr1[3] = comm->cell_x1[dim1];
1598             if (isDlbOn(dd->comm))
1599             {
1600                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1601                 if (bDistMB)
1602                 {
1603                     /* For the multi-body distance we need the maximum */
1604                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1605                 }
1606             }
1607         }
1608     }
1609 }
1610
1611 /*! \brief Add the atom groups we need to send in this pulse from this
1612  * zone to \p localAtomGroups and \p work. */
1613 static void
1614 get_zone_pulse_cgs(gmx_domdec_t *dd,
1615                    int zonei, int zone,
1616                    int cg0, int cg1,
1617                    gmx::ArrayRef<const int> globalAtomGroupIndices,
1618                    int dim, int dim_ind,
1619                    int dim0, int dim1, int dim2,
1620                    real r_comm2, real r_bcomm2,
1621                    matrix box,
1622                    bool distanceIsTriclinic,
1623                    rvec *normal,
1624                    real skew_fac2_d, real skew_fac_01,
1625                    rvec *v_d, rvec *v_0, rvec *v_1,
1626                    const dd_corners_t *c,
1627                    const rvec sf2_round,
1628                    gmx_bool bDistBonded,
1629                    gmx_bool bBondComm,
1630                    gmx_bool bDist2B,
1631                    gmx_bool bDistMB,
1632                    rvec *cg_cm,
1633                    gmx::ArrayRef<const int> cginfo,
1634                    std::vector<int>     *localAtomGroups,
1635                    dd_comm_setup_work_t *work)
1636 {
1637     gmx_domdec_comm_t *comm;
1638     gmx_bool           bScrew;
1639     gmx_bool           bDistMB_pulse;
1640     int                cg, i;
1641     real               r2, rb2, r, tric_sh;
1642     rvec               rn, rb;
1643     int                dimd;
1644     int                nsend_z, nat;
1645
1646     comm = dd->comm;
1647
1648     bScrew = (dd->bScrewPBC && dim == XX);
1649
1650     bDistMB_pulse = (bDistMB && bDistBonded);
1651
1652     /* Unpack the work data */
1653     std::vector<int>       &ibuf = work->atomGroupBuffer;
1654     std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1655     nsend_z                      = 0;
1656     nat                          = work->nat;
1657
1658     for (cg = cg0; cg < cg1; cg++)
1659     {
1660         r2  = 0;
1661         rb2 = 0;
1662         if (!distanceIsTriclinic)
1663         {
1664             /* Rectangular direction, easy */
1665             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1666             if (r > 0)
1667             {
1668                 r2 += r*r;
1669             }
1670             if (bDistMB_pulse)
1671             {
1672                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1673                 if (r > 0)
1674                 {
1675                     rb2 += r*r;
1676                 }
1677             }
1678             /* Rounding gives at most a 16% reduction
1679              * in communicated atoms
1680              */
1681             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1682             {
1683                 r = cg_cm[cg][dim0] - c->cr0;
1684                 /* This is the first dimension, so always r >= 0 */
1685                 r2 += r*r;
1686                 if (bDistMB_pulse)
1687                 {
1688                     rb2 += r*r;
1689                 }
1690             }
1691             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1692             {
1693                 r = cg_cm[cg][dim1] - c->cr1[zone];
1694                 if (r > 0)
1695                 {
1696                     r2 += r*r;
1697                 }
1698                 if (bDistMB_pulse)
1699                 {
1700                     r = cg_cm[cg][dim1] - c->bcr1;
1701                     if (r > 0)
1702                     {
1703                         rb2 += r*r;
1704                     }
1705                 }
1706             }
1707         }
1708         else
1709         {
1710             /* Triclinic direction, more complicated */
1711             clear_rvec(rn);
1712             clear_rvec(rb);
1713             /* Rounding, conservative as the skew_fac multiplication
1714              * will slightly underestimate the distance.
1715              */
1716             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1717             {
1718                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1719                 for (i = dim0+1; i < DIM; i++)
1720                 {
1721                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1722                 }
1723                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1724                 if (bDistMB_pulse)
1725                 {
1726                     rb[dim0] = rn[dim0];
1727                     rb2      = r2;
1728                 }
1729                 /* Take care that the cell planes along dim0 might not
1730                  * be orthogonal to those along dim1 and dim2.
1731                  */
1732                 for (i = 1; i <= dim_ind; i++)
1733                 {
1734                     dimd = dd->dim[i];
1735                     if (normal[dim0][dimd] > 0)
1736                     {
1737                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1738                         if (bDistMB_pulse)
1739                         {
1740                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1741                         }
1742                     }
1743                 }
1744             }
1745             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1746             {
1747                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1748                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1749                 tric_sh   = 0;
1750                 for (i = dim1+1; i < DIM; i++)
1751                 {
1752                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1753                 }
1754                 rn[dim1] += tric_sh;
1755                 if (rn[dim1] > 0)
1756                 {
1757                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1758                     /* Take care of coupling of the distances
1759                      * to the planes along dim0 and dim1 through dim2.
1760                      */
1761                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1762                     /* Take care that the cell planes along dim1
1763                      * might not be orthogonal to that along dim2.
1764                      */
1765                     if (normal[dim1][dim2] > 0)
1766                     {
1767                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1768                     }
1769                 }
1770                 if (bDistMB_pulse)
1771                 {
1772                     rb[dim1] +=
1773                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1774                     if (rb[dim1] > 0)
1775                     {
1776                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1777                         /* Take care of coupling of the distances
1778                          * to the planes along dim0 and dim1 through dim2.
1779                          */
1780                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1781                         /* Take care that the cell planes along dim1
1782                          * might not be orthogonal to that along dim2.
1783                          */
1784                         if (normal[dim1][dim2] > 0)
1785                         {
1786                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1787                         }
1788                     }
1789                 }
1790             }
1791             /* The distance along the communication direction */
1792             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1793             tric_sh  = 0;
1794             for (i = dim+1; i < DIM; i++)
1795             {
1796                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1797             }
1798             rn[dim] += tric_sh;
1799             if (rn[dim] > 0)
1800             {
1801                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1802                 /* Take care of coupling of the distances
1803                  * to the planes along dim0 and dim1 through dim2.
1804                  */
1805                 if (dim_ind == 1 && zonei == 1)
1806                 {
1807                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1808                 }
1809             }
1810             if (bDistMB_pulse)
1811             {
1812                 clear_rvec(rb);
1813                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1814                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1815                 if (rb[dim] > 0)
1816                 {
1817                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1818                     /* Take care of coupling of the distances
1819                      * to the planes along dim0 and dim1 through dim2.
1820                      */
1821                     if (dim_ind == 1 && zonei == 1)
1822                     {
1823                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1824                     }
1825                 }
1826             }
1827         }
1828
1829         if (r2 < r_comm2 ||
1830             (bDistBonded &&
1831              ((bDistMB && rb2 < r_bcomm2) ||
1832               (bDist2B && r2  < r_bcomm2)) &&
1833              (!bBondComm ||
1834               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1835                missing_link(comm->cglink, globalAtomGroupIndices[cg],
1836                             comm->bLocalCG)))))
1837         {
1838             /* Store the local and global atom group indices and position */
1839             localAtomGroups->push_back(cg);
1840             ibuf.push_back(globalAtomGroupIndices[cg]);
1841             nsend_z++;
1842
1843             rvec posPbc;
1844             if (dd->ci[dim] == 0)
1845             {
1846                 /* Correct cg_cm for pbc */
1847                 rvec_add(cg_cm[cg], box[dim], posPbc);
1848                 if (bScrew)
1849                 {
1850                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1851                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1852                 }
1853             }
1854             else
1855             {
1856                 copy_rvec(cg_cm[cg], posPbc);
1857             }
1858             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1859
1860             nat += 1;
1861         }
1862     }
1863
1864     work->nat        = nat;
1865     work->nsend_zone = nsend_z;
1866 }
1867
1868 //! Clear data.
1869 static void clearCommSetupData(dd_comm_setup_work_t *work)
1870 {
1871     work->localAtomGroupBuffer.clear();
1872     work->atomGroupBuffer.clear();
1873     work->positionBuffer.clear();
1874     work->nat        = 0;
1875     work->nsend_zone = 0;
1876 }
1877
1878 //! Prepare DD communication.
1879 static void setup_dd_communication(gmx_domdec_t *dd,
1880                                    matrix box, gmx_ddbox_t *ddbox,
1881                                    t_forcerec *fr,
1882                                    t_state *state,
1883                                    PaddedVector<gmx::RVec> *f)
1884 {
1885     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1886     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1887     int                    c, i, cg, cg_gl;
1888     int                   *zone_cg_range, pos_cg;
1889     gmx_domdec_comm_t     *comm;
1890     gmx_domdec_zones_t    *zones;
1891     gmx_domdec_comm_dim_t *cd;
1892     cginfo_mb_t           *cginfo_mb;
1893     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1894     dd_corners_t           corners;
1895     rvec                  *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1896     real                   skew_fac2_d, skew_fac_01;
1897     rvec                   sf2_round;
1898
1899     if (debug)
1900     {
1901         fprintf(debug, "Setting up DD communication\n");
1902     }
1903
1904     comm  = dd->comm;
1905
1906     if (comm->dth.empty())
1907     {
1908         /* Initialize the thread data.
1909          * This can not be done in init_domain_decomposition,
1910          * as the numbers of threads is determined later.
1911          */
1912         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1913         comm->dth.resize(numThreads);
1914     }
1915
1916     bBondComm = comm->bBondComm;
1917
1918     /* Do we need to determine extra distances for multi-body bondeds? */
1919     bDistMB = (comm->haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1920
1921     /* Do we need to determine extra distances for only two-body bondeds? */
1922     bDist2B = (bBondComm && !bDistMB);
1923
1924     const real r_comm2  = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff));
1925     const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff_mbody));
1926
1927     if (debug)
1928     {
1929         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1930     }
1931
1932     zones = &comm->zones;
1933
1934     dim0 = dd->dim[0];
1935     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1936     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1937
1938     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1939
1940     /* Triclinic stuff */
1941     normal      = ddbox->normal;
1942     skew_fac_01 = 0;
1943     if (dd->ndim >= 2)
1944     {
1945         v_0 = ddbox->v[dim0];
1946         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1947         {
1948             /* Determine the coupling coefficient for the distances
1949              * to the cell planes along dim0 and dim1 through dim2.
1950              * This is required for correct rounding.
1951              */
1952             skew_fac_01 =
1953                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
1954             if (debug)
1955             {
1956                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1957             }
1958         }
1959     }
1960     if (dd->ndim >= 3)
1961     {
1962         v_1 = ddbox->v[dim1];
1963     }
1964
1965     zone_cg_range = zones->cg_range;
1966     cginfo_mb     = fr->cginfo_mb;
1967
1968     zone_cg_range[0]   = 0;
1969     zone_cg_range[1]   = dd->ncg_home;
1970     comm->zone_ncg1[0] = dd->ncg_home;
1971     pos_cg             = dd->ncg_home;
1972
1973     nat_tot = comm->atomRanges.numHomeAtoms();
1974     nzone   = 1;
1975     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1976     {
1977         dim = dd->dim[dim_ind];
1978         cd  = &comm->cd[dim_ind];
1979
1980         /* Check if we need to compute triclinic distances along this dim */
1981         bool distanceIsTriclinic = false;
1982         for (i = 0; i <= dim_ind; i++)
1983         {
1984             if (ddbox->tric_dir[dd->dim[i]])
1985             {
1986                 distanceIsTriclinic = true;
1987             }
1988         }
1989
1990         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1991         {
1992             /* No pbc in this dimension, the first node should not comm. */
1993             nzone_send = 0;
1994         }
1995         else
1996         {
1997             nzone_send = nzone;
1998         }
1999
2000         v_d                = ddbox->v[dim];
2001         skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
2002
2003         cd->receiveInPlace = true;
2004         for (int p = 0; p < cd->numPulses(); p++)
2005         {
2006             /* Only atoms communicated in the first pulse are used
2007              * for multi-body bonded interactions or for bBondComm.
2008              */
2009             bDistBonded = ((bDistMB || bDist2B) && p == 0);
2010
2011             gmx_domdec_ind_t *ind = &cd->ind[p];
2012
2013             /* Thread 0 writes in the global index array */
2014             ind->index.clear();
2015             clearCommSetupData(&comm->dth[0]);
2016
2017             for (zone = 0; zone < nzone_send; zone++)
2018             {
2019                 if (dim_ind > 0 && distanceIsTriclinic)
2020                 {
2021                     /* Determine slightly more optimized skew_fac's
2022                      * for rounding.
2023                      * This reduces the number of communicated atoms
2024                      * by about 10% for 3D DD of rhombic dodecahedra.
2025                      */
2026                     for (dimd = 0; dimd < dim; dimd++)
2027                     {
2028                         sf2_round[dimd] = 1;
2029                         if (ddbox->tric_dir[dimd])
2030                         {
2031                             for (i = dd->dim[dimd]+1; i < DIM; i++)
2032                             {
2033                                 /* If we are shifted in dimension i
2034                                  * and the cell plane is tilted forward
2035                                  * in dimension i, skip this coupling.
2036                                  */
2037                                 if (!(zones->shift[nzone+zone][i] &&
2038                                       ddbox->v[dimd][i][dimd] >= 0))
2039                                 {
2040                                     sf2_round[dimd] +=
2041                                         gmx::square(ddbox->v[dimd][i][dimd]);
2042                                 }
2043                             }
2044                             sf2_round[dimd] = 1/sf2_round[dimd];
2045                         }
2046                     }
2047                 }
2048
2049                 zonei = zone_perm[dim_ind][zone];
2050                 if (p == 0)
2051                 {
2052                     /* Here we permutate the zones to obtain a convenient order
2053                      * for neighbor searching
2054                      */
2055                     cg0 = zone_cg_range[zonei];
2056                     cg1 = zone_cg_range[zonei+1];
2057                 }
2058                 else
2059                 {
2060                     /* Look only at the cg's received in the previous grid pulse
2061                      */
2062                     cg1 = zone_cg_range[nzone+zone+1];
2063                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2064                 }
2065
2066                 const int numThreads = gmx::ssize(comm->dth);
2067 #pragma omp parallel for num_threads(numThreads) schedule(static)
2068                 for (int th = 0; th < numThreads; th++)
2069                 {
2070                     try
2071                     {
2072                         dd_comm_setup_work_t &work = comm->dth[th];
2073
2074                         /* Retain data accumulated into buffers of thread 0 */
2075                         if (th > 0)
2076                         {
2077                             clearCommSetupData(&work);
2078                         }
2079
2080                         int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
2081                         int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2082
2083                         /* Get the cg's for this pulse in this zone */
2084                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2085                                            dd->globalAtomGroupIndices,
2086                                            dim, dim_ind, dim0, dim1, dim2,
2087                                            r_comm2, r_bcomm2,
2088                                            box, distanceIsTriclinic,
2089                                            normal, skew_fac2_d, skew_fac_01,
2090                                            v_d, v_0, v_1, &corners, sf2_round,
2091                                            bDistBonded, bBondComm,
2092                                            bDist2B, bDistMB,
2093                                            state->x.rvec_array(),
2094                                            fr->cginfo,
2095                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2096                                            &work);
2097                     }
2098                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2099                 } // END
2100
2101                 std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
2102                 std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
2103                 ind->nsend[zone]  = comm->dth[0].nsend_zone;
2104                 /* Append data of threads>=1 to the communication buffers */
2105                 for (int th = 1; th < numThreads; th++)
2106                 {
2107                     const dd_comm_setup_work_t &dth = comm->dth[th];
2108
2109                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2110                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2111                     positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2112                     comm->dth[0].nat += dth.nat;
2113                     ind->nsend[zone] += dth.nsend_zone;
2114                 }
2115             }
2116             /* Clear the counts in case we do not have pbc */
2117             for (zone = nzone_send; zone < nzone; zone++)
2118             {
2119                 ind->nsend[zone] = 0;
2120             }
2121             ind->nsend[nzone]     = ind->index.size();
2122             ind->nsend[nzone + 1] = comm->dth[0].nat;
2123             /* Communicate the number of cg's and atoms to receive */
2124             ddSendrecv(dd, dim_ind, dddirBackward,
2125                        ind->nsend, nzone+2,
2126                        ind->nrecv, nzone+2);
2127
2128             if (p > 0)
2129             {
2130                 /* We can receive in place if only the last zone is not empty */
2131                 for (zone = 0; zone < nzone-1; zone++)
2132                 {
2133                     if (ind->nrecv[zone] > 0)
2134                     {
2135                         cd->receiveInPlace = false;
2136                     }
2137                 }
2138             }
2139
2140             int receiveBufferSize = 0;
2141             if (!cd->receiveInPlace)
2142             {
2143                 receiveBufferSize = ind->nrecv[nzone];
2144             }
2145             /* These buffer are actually only needed with in-place */
2146             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2147             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2148
2149             dd_comm_setup_work_t     &work = comm->dth[0];
2150
2151             /* Make space for the global cg indices */
2152             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2153             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2154             /* Communicate the global cg indices */
2155             gmx::ArrayRef<int> integerBufferRef;
2156             if (cd->receiveInPlace)
2157             {
2158                 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2159             }
2160             else
2161             {
2162                 integerBufferRef = globalAtomGroupBuffer.buffer;
2163             }
2164             ddSendrecv<int>(dd, dim_ind, dddirBackward,
2165                             work.atomGroupBuffer, integerBufferRef);
2166
2167             /* Make space for cg_cm */
2168             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2169
2170             /* Communicate the coordinates */
2171             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2172             if (cd->receiveInPlace)
2173             {
2174                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2175             }
2176             else
2177             {
2178                 rvecBufferRef = rvecBuffer.buffer;
2179             }
2180             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2181                                   work.positionBuffer, rvecBufferRef);
2182
2183             /* Make the charge group index */
2184             if (cd->receiveInPlace)
2185             {
2186                 zone = (p == 0 ? 0 : nzone - 1);
2187                 while (zone < nzone)
2188                 {
2189                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
2190                     {
2191                         cg_gl                              = dd->globalAtomGroupIndices[pos_cg];
2192                         fr->cginfo[pos_cg]                 = ddcginfo(cginfo_mb, cg_gl);
2193                         if (bBondComm)
2194                         {
2195                             /* Update the charge group presence,
2196                              * so we can use it in the next pass of the loop.
2197                              */
2198                             comm->bLocalCG[cg_gl] = TRUE;
2199                         }
2200                         pos_cg++;
2201                     }
2202                     if (p == 0)
2203                     {
2204                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2205                     }
2206                     zone++;
2207                     zone_cg_range[nzone+zone] = pos_cg;
2208                 }
2209             }
2210             else
2211             {
2212                 /* This part of the code is never executed with bBondComm. */
2213                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2214                                  dd->globalAtomGroupIndices, integerBufferRef.data(),
2215                                  state->x, rvecBufferRef,
2216                                  fr->cginfo_mb, fr->cginfo);
2217                 pos_cg += ind->nrecv[nzone];
2218             }
2219             nat_tot += ind->nrecv[nzone+1];
2220         }
2221         if (!cd->receiveInPlace)
2222         {
2223             /* Store the atom block for easy copying of communication buffers */
2224             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2225         }
2226         nzone += nzone;
2227     }
2228
2229     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2230
2231     if (!bBondComm)
2232     {
2233         /* We don't need to update cginfo, since that was alrady done above.
2234          * So we pass NULL for the forcerec.
2235          */
2236         dd_set_cginfo(dd->globalAtomGroupIndices,
2237                       dd->ncg_home, dd->globalAtomGroupIndices.size(),
2238                       nullptr, comm->bLocalCG);
2239     }
2240
2241     if (debug)
2242     {
2243         fprintf(debug, "Finished setting up DD communication, zones:");
2244         for (c = 0; c < zones->n; c++)
2245         {
2246             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2247         }
2248         fprintf(debug, "\n");
2249     }
2250 }
2251
2252 //! Set boundaries for the charge group range.
2253 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2254 {
2255     int c;
2256
2257     for (c = 0; c < zones->nizone; c++)
2258     {
2259         zones->izone[c].cg1  = zones->cg_range[c+1];
2260         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2261         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2262     }
2263 }
2264
2265 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2266  *
2267  * Also sets the atom density for the home zone when \p zone_start=0.
2268  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2269  * how many charge groups will move but are still part of the current range.
2270  * \todo When converting domdec to use proper classes, all these variables
2271  *       should be private and a method should return the correct count
2272  *       depending on an internal state.
2273  *
2274  * \param[in,out] dd          The domain decomposition struct
2275  * \param[in]     box         The box
2276  * \param[in]     ddbox       The domain decomposition box struct
2277  * \param[in]     zone_start  The start of the zone range to set sizes for
2278  * \param[in]     zone_end    The end of the zone range to set sizes for
2279  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2280  */
2281 static void set_zones_size(gmx_domdec_t *dd,
2282                            matrix box, const gmx_ddbox_t *ddbox,
2283                            int zone_start, int zone_end,
2284                            int numMovedChargeGroupsInHomeZone)
2285 {
2286     gmx_domdec_comm_t  *comm;
2287     gmx_domdec_zones_t *zones;
2288     gmx_bool            bDistMB;
2289     int                 z, zi, d, dim;
2290     real                rcs, rcmbs;
2291     int                 i, j;
2292     real                vol;
2293
2294     comm = dd->comm;
2295
2296     zones = &comm->zones;
2297
2298     /* Do we need to determine extra distances for multi-body bondeds? */
2299     bDistMB = (comm->haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2300
2301     for (z = zone_start; z < zone_end; z++)
2302     {
2303         /* Copy cell limits to zone limits.
2304          * Valid for non-DD dims and non-shifted dims.
2305          */
2306         copy_rvec(comm->cell_x0, zones->size[z].x0);
2307         copy_rvec(comm->cell_x1, zones->size[z].x1);
2308     }
2309
2310     for (d = 0; d < dd->ndim; d++)
2311     {
2312         dim = dd->dim[d];
2313
2314         for (z = 0; z < zones->n; z++)
2315         {
2316             /* With a staggered grid we have different sizes
2317              * for non-shifted dimensions.
2318              */
2319             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2320             {
2321                 if (d == 1)
2322                 {
2323                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2324                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2325                 }
2326                 else if (d == 2)
2327                 {
2328                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2329                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2330                 }
2331             }
2332         }
2333
2334         rcs   = comm->cutoff;
2335         rcmbs = comm->cutoff_mbody;
2336         if (ddbox->tric_dir[dim])
2337         {
2338             rcs   /= ddbox->skew_fac[dim];
2339             rcmbs /= ddbox->skew_fac[dim];
2340         }
2341
2342         /* Set the lower limit for the shifted zone dimensions */
2343         for (z = zone_start; z < zone_end; z++)
2344         {
2345             if (zones->shift[z][dim] > 0)
2346             {
2347                 dim = dd->dim[d];
2348                 if (!isDlbOn(dd->comm) || d == 0)
2349                 {
2350                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2351                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2352                 }
2353                 else
2354                 {
2355                     /* Here we take the lower limit of the zone from
2356                      * the lowest domain of the zone below.
2357                      */
2358                     if (z < 4)
2359                     {
2360                         zones->size[z].x0[dim] =
2361                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2362                     }
2363                     else
2364                     {
2365                         if (d == 1)
2366                         {
2367                             zones->size[z].x0[dim] =
2368                                 zones->size[zone_perm[2][z-4]].x0[dim];
2369                         }
2370                         else
2371                         {
2372                             zones->size[z].x0[dim] =
2373                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2374                         }
2375                     }
2376                     /* A temporary limit, is updated below */
2377                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2378
2379                     if (bDistMB)
2380                     {
2381                         for (zi = 0; zi < zones->nizone; zi++)
2382                         {
2383                             if (zones->shift[zi][dim] == 0)
2384                             {
2385                                 /* This takes the whole zone into account.
2386                                  * With multiple pulses this will lead
2387                                  * to a larger zone then strictly necessary.
2388                                  */
2389                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2390                                                                   zones->size[zi].x1[dim]+rcmbs);
2391                             }
2392                         }
2393                     }
2394                 }
2395             }
2396         }
2397
2398         /* Loop over the i-zones to set the upper limit of each
2399          * j-zone they see.
2400          */
2401         for (zi = 0; zi < zones->nizone; zi++)
2402         {
2403             if (zones->shift[zi][dim] == 0)
2404             {
2405                 /* We should only use zones up to zone_end */
2406                 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2407                 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2408                 {
2409                     if (zones->shift[z][dim] > 0)
2410                     {
2411                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2412                                                           zones->size[zi].x1[dim]+rcs);
2413                     }
2414                 }
2415             }
2416         }
2417     }
2418
2419     for (z = zone_start; z < zone_end; z++)
2420     {
2421         /* Initialization only required to keep the compiler happy */
2422         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2423         int  nc, c;
2424
2425         /* To determine the bounding box for a zone we need to find
2426          * the extreme corners of 4, 2 or 1 corners.
2427          */
2428         nc = 1 << (ddbox->nboundeddim - 1);
2429
2430         for (c = 0; c < nc; c++)
2431         {
2432             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2433             corner[XX] = 0;
2434             if ((c & 1) == 0)
2435             {
2436                 corner[YY] = zones->size[z].x0[YY];
2437             }
2438             else
2439             {
2440                 corner[YY] = zones->size[z].x1[YY];
2441             }
2442             if ((c & 2) == 0)
2443             {
2444                 corner[ZZ] = zones->size[z].x0[ZZ];
2445             }
2446             else
2447             {
2448                 corner[ZZ] = zones->size[z].x1[ZZ];
2449             }
2450             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
2451                 box[ZZ][1 - dd->dim[0]] != 0)
2452             {
2453                 /* With 1D domain decomposition the cg's are not in
2454                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2455                  * Shift the corner of the z-vector back to along the box
2456                  * vector of dimension d, so it will later end up at 0 along d.
2457                  * This can affect the location of this corner along dd->dim[0]
2458                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2459                  */
2460                 int d = 1 - dd->dim[0];
2461
2462                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2463             }
2464             /* Apply the triclinic couplings */
2465             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2466             {
2467                 for (j = XX; j < i; j++)
2468                 {
2469                     corner[j] += corner[i]*box[i][j]/box[i][i];
2470                 }
2471             }
2472             if (c == 0)
2473             {
2474                 copy_rvec(corner, corner_min);
2475                 copy_rvec(corner, corner_max);
2476             }
2477             else
2478             {
2479                 for (i = 0; i < DIM; i++)
2480                 {
2481                     corner_min[i] = std::min(corner_min[i], corner[i]);
2482                     corner_max[i] = std::max(corner_max[i], corner[i]);
2483                 }
2484             }
2485         }
2486         /* Copy the extreme cornes without offset along x */
2487         for (i = 0; i < DIM; i++)
2488         {
2489             zones->size[z].bb_x0[i] = corner_min[i];
2490             zones->size[z].bb_x1[i] = corner_max[i];
2491         }
2492         /* Add the offset along x */
2493         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2494         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2495     }
2496
2497     if (zone_start == 0)
2498     {
2499         vol = 1;
2500         for (dim = 0; dim < DIM; dim++)
2501         {
2502             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2503         }
2504         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2505     }
2506
2507     if (debug)
2508     {
2509         for (z = zone_start; z < zone_end; z++)
2510         {
2511             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2512                     z,
2513                     zones->size[z].x0[XX], zones->size[z].x1[XX],
2514                     zones->size[z].x0[YY], zones->size[z].x1[YY],
2515                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2516             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2517                     z,
2518                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2519                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2520                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2521         }
2522     }
2523 }
2524
2525 /*! \brief Order data in \p dataToSort according to \p sort
2526  *
2527  * Note: both buffers should have at least \p sort.size() elements.
2528  */
2529 template <typename T>
2530 static void
2531 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2532             gmx::ArrayRef<T>                   dataToSort,
2533             gmx::ArrayRef<T>                   sortBuffer)
2534 {
2535     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2536     GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2537
2538     /* Order the data into the temporary buffer */
2539     size_t i = 0;
2540     for (const gmx_cgsort_t &entry : sort)
2541     {
2542         sortBuffer[i++] = dataToSort[entry.ind];
2543     }
2544
2545     /* Copy back to the original array */
2546     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2547               dataToSort.begin());
2548 }
2549
2550 /*! \brief Order data in \p dataToSort according to \p sort
2551  *
2552  * Note: \p vectorToSort should have at least \p sort.size() elements,
2553  *       \p workVector is resized when it is too small.
2554  */
2555 template <typename T>
2556 static void
2557 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2558             gmx::ArrayRef<T>                   vectorToSort,
2559             std::vector<T>                    *workVector)
2560 {
2561     if (gmx::index(workVector->size()) < sort.ssize())
2562     {
2563         workVector->resize(sort.size());
2564     }
2565     orderVector<T>(sort, vectorToSort, *workVector);
2566 }
2567
2568 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2569 static void dd_sort_order_nbnxn(const t_forcerec          *fr,
2570                                 std::vector<gmx_cgsort_t> *sort)
2571 {
2572     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2573
2574     /* Using push_back() instead of this resize results in much slower code */
2575     sort->resize(atomOrder.size());
2576     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2577     size_t                      numSorted = 0;
2578     for (int i : atomOrder)
2579     {
2580         if (i >= 0)
2581         {
2582             /* The values of nsc and ind_gl are not used in this case */
2583             buffer[numSorted++].ind = i;
2584         }
2585     }
2586     sort->resize(numSorted);
2587 }
2588
2589 //! Returns the sorting state for DD.
2590 static void dd_sort_state(gmx_domdec_t *dd, t_forcerec *fr, t_state *state)
2591 {
2592     gmx_domdec_sort_t *sort = dd->comm->sort.get();
2593
2594     dd_sort_order_nbnxn(fr, &sort->sorted);
2595
2596     /* We alloc with the old size, since cgindex is still old */
2597     DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2598
2599     /* Set the new home atom/charge group count */
2600     dd->ncg_home = sort->sorted.size();
2601     if (debug)
2602     {
2603         fprintf(debug, "Set the new home atom count to %d\n",
2604                 dd->ncg_home);
2605     }
2606
2607     /* Reorder the state */
2608     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2609     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2610
2611     if (state->flags & (1 << estX))
2612     {
2613         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2614     }
2615     if (state->flags & (1 << estV))
2616     {
2617         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2618     }
2619     if (state->flags & (1 << estCGP))
2620     {
2621         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2622     }
2623
2624     /* Reorder the global cg index */
2625     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2626     /* Reorder the cginfo */
2627     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2628     /* Set the home atom number */
2629     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2630
2631     /* The atoms are now exactly in grid order, update the grid order */
2632     fr->nbv->setLocalAtomOrder();
2633 }
2634
2635 //! Accumulates load statistics.
2636 static void add_dd_statistics(gmx_domdec_t *dd)
2637 {
2638     gmx_domdec_comm_t *comm = dd->comm;
2639
2640     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2641     {
2642         auto range = static_cast<DDAtomRanges::Type>(i);
2643         comm->sum_nat[i] +=
2644             comm->atomRanges.end(range) - comm->atomRanges.start(range);
2645     }
2646     comm->ndecomp++;
2647 }
2648
2649 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2650 {
2651     gmx_domdec_comm_t *comm = dd->comm;
2652
2653     /* Reset all the statistics and counters for total run counting */
2654     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2655     {
2656         comm->sum_nat[i] = 0;
2657     }
2658     comm->ndecomp   = 0;
2659     comm->nload     = 0;
2660     comm->load_step = 0;
2661     comm->load_sum  = 0;
2662     comm->load_max  = 0;
2663     clear_ivec(comm->load_lim);
2664     comm->load_mdf = 0;
2665     comm->load_pme = 0;
2666 }
2667
2668 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2669 {
2670     gmx_domdec_comm_t *comm      = cr->dd->comm;
2671
2672     const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2673     gmx_sumd(numRanges, comm->sum_nat, cr);
2674
2675     if (fplog == nullptr)
2676     {
2677         return;
2678     }
2679
2680     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2681
2682     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2683     {
2684         auto   range = static_cast<DDAtomRanges::Type>(i);
2685         double av    = comm->sum_nat[i]/comm->ndecomp;
2686         switch (range)
2687         {
2688             case DDAtomRanges::Type::Zones:
2689                 fprintf(fplog,
2690                         " av. #atoms communicated per step for force:  %d x %.1f\n",
2691                         2, av);
2692                 break;
2693             case DDAtomRanges::Type::Vsites:
2694                 if (cr->dd->vsite_comm)
2695                 {
2696                     fprintf(fplog,
2697                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2698                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2699                             av);
2700                 }
2701                 break;
2702             case DDAtomRanges::Type::Constraints:
2703                 if (cr->dd->constraint_comm)
2704                 {
2705                     fprintf(fplog,
2706                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2707                             1 + ir->nLincsIter, av);
2708                 }
2709                 break;
2710             default:
2711                 gmx_incons(" Unknown type for DD statistics");
2712         }
2713     }
2714     fprintf(fplog, "\n");
2715
2716     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
2717     {
2718         print_dd_load_av(fplog, cr->dd);
2719     }
2720 }
2721
2722 // TODO Remove fplog when group scheme and charge groups are gone
2723 void dd_partition_system(FILE                    *fplog,
2724                          const gmx::MDLogger     &mdlog,
2725                          int64_t                  step,
2726                          const t_commrec         *cr,
2727                          gmx_bool                 bMasterState,
2728                          int                      nstglobalcomm,
2729                          t_state                 *state_global,
2730                          const gmx_mtop_t        &top_global,
2731                          const t_inputrec        *ir,
2732                          gmx::ImdSession         *imdSession,
2733                          pull_t                  *pull_work,
2734                          t_state                 *state_local,
2735                          PaddedVector<gmx::RVec> *f,
2736                          gmx::MDAtoms            *mdAtoms,
2737                          gmx_localtop_t          *top_local,
2738                          t_forcerec              *fr,
2739                          gmx_vsite_t             *vsite,
2740                          gmx::Constraints        *constr,
2741                          t_nrnb                  *nrnb,
2742                          gmx_wallcycle           *wcycle,
2743                          gmx_bool                 bVerbose)
2744 {
2745     gmx_domdec_t      *dd;
2746     gmx_domdec_comm_t *comm;
2747     gmx_ddbox_t        ddbox = {0};
2748     int64_t            step_pcoupl;
2749     rvec               cell_ns_x0, cell_ns_x1;
2750     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2751     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2752     gmx_bool           bRedist;
2753     ivec               np;
2754     real               grid_density;
2755     char               sbuf[22];
2756
2757     wallcycle_start(wcycle, ewcDOMDEC);
2758
2759     dd   = cr->dd;
2760     comm = dd->comm;
2761
2762     // TODO if the update code becomes accessible here, use
2763     // upd->deform for this logic.
2764     bBoxChanged = (bMasterState || inputrecDeform(ir));
2765     if (ir->epc != epcNO)
2766     {
2767         /* With nstpcouple > 1 pressure coupling happens.
2768          * one step after calculating the pressure.
2769          * Box scaling happens at the end of the MD step,
2770          * after the DD partitioning.
2771          * We therefore have to do DLB in the first partitioning
2772          * after an MD step where P-coupling occurred.
2773          * We need to determine the last step in which p-coupling occurred.
2774          * MRS -- need to validate this for vv?
2775          */
2776         int n = ir->nstpcouple;
2777         if (n == 1)
2778         {
2779             step_pcoupl = step - 1;
2780         }
2781         else
2782         {
2783             step_pcoupl = ((step - 1)/n)*n + 1;
2784         }
2785         if (step_pcoupl >= comm->partition_step)
2786         {
2787             bBoxChanged = TRUE;
2788         }
2789     }
2790
2791     bNStGlobalComm = (step % nstglobalcomm == 0);
2792
2793     if (!isDlbOn(comm))
2794     {
2795         bDoDLB = FALSE;
2796     }
2797     else
2798     {
2799         /* Should we do dynamic load balacing this step?
2800          * Since it requires (possibly expensive) global communication,
2801          * we might want to do DLB less frequently.
2802          */
2803         if (bBoxChanged || ir->epc != epcNO)
2804         {
2805             bDoDLB = bBoxChanged;
2806         }
2807         else
2808         {
2809             bDoDLB = bNStGlobalComm;
2810         }
2811     }
2812
2813     /* Check if we have recorded loads on the nodes */
2814     if (comm->bRecordLoad && dd_load_count(comm) > 0)
2815     {
2816         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2817
2818         /* Print load every nstlog, first and last step to the log file */
2819         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
2820                     comm->n_load_collect == 0 ||
2821                     (ir->nsteps >= 0 &&
2822                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
2823
2824         /* Avoid extra communication due to verbose screen output
2825          * when nstglobalcomm is set.
2826          */
2827         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
2828             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2829         {
2830             get_load_distribution(dd, wcycle);
2831             if (DDMASTER(dd))
2832             {
2833                 if (bLogLoad)
2834                 {
2835                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
2836                 }
2837                 if (bVerbose)
2838                 {
2839                     dd_print_load_verbose(dd);
2840                 }
2841             }
2842             comm->n_load_collect++;
2843
2844             if (isDlbOn(comm))
2845             {
2846                 if (DDMASTER(dd))
2847                 {
2848                     /* Add the measured cycles to the running average */
2849                     const float averageFactor        = 0.1F;
2850                     comm->cyclesPerStepDlbExpAverage =
2851                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
2852                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
2853                 }
2854                 if (comm->dlbState == DlbState::onCanTurnOff &&
2855                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2856                 {
2857                     gmx_bool turnOffDlb;
2858                     if (DDMASTER(dd))
2859                     {
2860                         /* If the running averaged cycles with DLB are more
2861                          * than before we turned on DLB, turn off DLB.
2862                          * We will again run and check the cycles without DLB
2863                          * and we can then decide if to turn off DLB forever.
2864                          */
2865                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
2866                                       comm->cyclesPerStepBeforeDLB);
2867                     }
2868                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2869                     if (turnOffDlb)
2870                     {
2871                         /* To turn off DLB, we need to redistribute the atoms */
2872                         dd_collect_state(dd, state_local, state_global);
2873                         bMasterState = TRUE;
2874                         turn_off_dlb(mdlog, dd, step);
2875                     }
2876                 }
2877             }
2878             else if (bCheckWhetherToTurnDlbOn)
2879             {
2880                 gmx_bool turnOffDlbForever = FALSE;
2881                 gmx_bool turnOnDlb         = FALSE;
2882
2883                 /* Since the timings are node dependent, the master decides */
2884                 if (DDMASTER(dd))
2885                 {
2886                     /* If we recently turned off DLB, we want to check if
2887                      * performance is better without DLB. We want to do this
2888                      * ASAP to minimize the chance that external factors
2889                      * slowed down the DLB step are gone here and we
2890                      * incorrectly conclude that DLB was causing the slowdown.
2891                      * So we measure one nstlist block, no running average.
2892                      */
2893                     if (comm->haveTurnedOffDlb &&
2894                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
2895                         comm->cyclesPerStepDlbExpAverage)
2896                     {
2897                         /* After turning off DLB we ran nstlist steps in fewer
2898                          * cycles than with DLB. This likely means that DLB
2899                          * in not benefical, but this could be due to a one
2900                          * time unlucky fluctuation, so we require two such
2901                          * observations in close succession to turn off DLB
2902                          * forever.
2903                          */
2904                         if (comm->dlbSlowerPartitioningCount > 0 &&
2905                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
2906                         {
2907                             turnOffDlbForever = TRUE;
2908                         }
2909                         comm->haveTurnedOffDlb           = false;
2910                         /* Register when we last measured DLB slowdown */
2911                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2912                     }
2913                     else
2914                     {
2915                         /* Here we check if the max PME rank load is more than 0.98
2916                          * the max PP force load. If so, PP DLB will not help,
2917                          * since we are (almost) limited by PME. Furthermore,
2918                          * DLB will cause a significant extra x/f redistribution
2919                          * cost on the PME ranks, which will then surely result
2920                          * in lower total performance.
2921                          */
2922                         if (cr->npmenodes > 0 &&
2923                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2924                         {
2925                             turnOnDlb = FALSE;
2926                         }
2927                         else
2928                         {
2929                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2930                         }
2931                     }
2932                 }
2933                 struct
2934                 {
2935                     gmx_bool turnOffDlbForever;
2936                     gmx_bool turnOnDlb;
2937                 }
2938                 bools {
2939                     turnOffDlbForever, turnOnDlb
2940                 };
2941                 dd_bcast(dd, sizeof(bools), &bools);
2942                 if (bools.turnOffDlbForever)
2943                 {
2944                     turn_off_dlb_forever(mdlog, dd, step);
2945                 }
2946                 else if (bools.turnOnDlb)
2947                 {
2948                     turn_on_dlb(mdlog, dd, step);
2949                     bDoDLB = TRUE;
2950                 }
2951             }
2952         }
2953         comm->n_load_have++;
2954     }
2955
2956     bRedist = FALSE;
2957     if (bMasterState)
2958     {
2959         /* Clear the old state */
2960         clearDDStateIndices(dd, 0, 0);
2961         ncgindex_set = 0;
2962
2963         auto xGlobal = positionsFromStatePointer(state_global);
2964
2965         set_ddbox(*dd, true,
2966                   DDMASTER(dd) ? state_global->box : nullptr,
2967                   true, xGlobal,
2968                   &ddbox);
2969
2970         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
2971
2972         /* Ensure that we have space for the new distribution */
2973         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
2974
2975         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2976
2977         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
2978     }
2979     else if (state_local->ddp_count != dd->ddp_count)
2980     {
2981         if (state_local->ddp_count > dd->ddp_count)
2982         {
2983             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
2984         }
2985
2986         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2987         {
2988             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
2989         }
2990
2991         /* Clear the old state */
2992         clearDDStateIndices(dd, 0, 0);
2993
2994         /* Restore the atom group indices from state_local */
2995         restoreAtomGroups(dd, state_local);
2996         make_dd_indices(dd, 0);
2997         ncgindex_set = dd->ncg_home;
2998
2999         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3000
3001         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3002
3003         set_ddbox(*dd, bMasterState, state_local->box,
3004                   true, state_local->x, &ddbox);
3005
3006         bRedist = isDlbOn(comm);
3007     }
3008     else
3009     {
3010         /* We have the full state, only redistribute the cgs */
3011
3012         /* Clear the non-home indices */
3013         clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
3014         ncgindex_set = 0;
3015
3016         /* To avoid global communication, we do not recompute the extent
3017          * of the system for dims without pbc. Therefore we need to copy
3018          * the previously computed values when we do not communicate.
3019          */
3020         if (!bNStGlobalComm)
3021         {
3022             copy_rvec(comm->box0, ddbox.box0    );
3023             copy_rvec(comm->box_size, ddbox.box_size);
3024         }
3025         set_ddbox(*dd, bMasterState, state_local->box,
3026                   bNStGlobalComm, state_local->x, &ddbox);
3027
3028         bBoxChanged = TRUE;
3029         bRedist     = TRUE;
3030     }
3031     /* Copy needed for dim's without pbc when avoiding communication */
3032     copy_rvec(ddbox.box0, comm->box0    );
3033     copy_rvec(ddbox.box_size, comm->box_size);
3034
3035     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(*dd), bMasterState, bDoDLB,
3036                       step, wcycle);
3037
3038     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
3039     {
3040         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3041     }
3042
3043     if (comm->useUpdateGroups)
3044     {
3045         comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3046                                        state_local->x);
3047     }
3048
3049     /* Check if we should sort the charge groups */
3050     const bool bSortCG = (bMasterState || bRedist);
3051
3052     /* When repartitioning we mark atom groups that will move to neighboring
3053      * DD cells, but we do not move them right away for performance reasons.
3054      * Thus we need to keep track of how many charge groups will move for
3055      * obtaining correct local charge group / atom counts.
3056      */
3057     ncg_moved = 0;
3058     if (bRedist)
3059     {
3060         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3061
3062         ncgindex_set = dd->ncg_home;
3063         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3064                            state_local, f, fr,
3065                            nrnb, &ncg_moved);
3066
3067         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3068
3069         if (comm->useUpdateGroups)
3070         {
3071             comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3072                                            state_local->x);
3073         }
3074
3075         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3076     }
3077
3078     // TODO: Integrate this code in the nbnxm module
3079     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3080                           dd, &ddbox,
3081                           &comm->cell_x0, &comm->cell_x1,
3082                           dd->ncg_home, as_rvec_array(state_local->x.data()),
3083                           cell_ns_x0, cell_ns_x1, &grid_density);
3084
3085     if (bBoxChanged)
3086     {
3087         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3088     }
3089
3090     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3091     copy_ivec(ddbox.tric_dir, comm->tric_dir);
3092
3093     if (bSortCG)
3094     {
3095         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3096
3097         /* Sort the state on charge group position.
3098          * This enables exact restarts from this step.
3099          * It also improves performance by about 15% with larger numbers
3100          * of atoms per node.
3101          */
3102
3103         /* Fill the ns grid with the home cell,
3104          * so we can sort with the indices.
3105          */
3106         set_zones_ncg_home(dd);
3107
3108         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3109
3110         nbnxn_put_on_grid(fr->nbv.get(), state_local->box,
3111                           0,
3112                           comm->zones.size[0].bb_x0,
3113                           comm->zones.size[0].bb_x1,
3114                           comm->updateGroupsCog.get(),
3115                           0, dd->ncg_home,
3116                           comm->zones.dens_zone0,
3117                           fr->cginfo,
3118                           state_local->x,
3119                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3120
3121         if (debug)
3122         {
3123             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3124                     gmx_step_str(step, sbuf), dd->ncg_home);
3125         }
3126         dd_sort_state(dd, fr, state_local);
3127
3128         /* After sorting and compacting we set the correct size */
3129         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3130
3131         /* Rebuild all the indices */
3132         dd->ga2la->clear();
3133         ncgindex_set = 0;
3134
3135         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3136     }
3137     else
3138     {
3139         /* With the group scheme the sorting array is part of the DD state,
3140          * but it just got out of sync, so mark as invalid by emptying it.
3141          */
3142         if (ir->cutoff_scheme == ecutsGROUP)
3143         {
3144             comm->sort->sorted.clear();
3145         }
3146     }
3147
3148     if (comm->useUpdateGroups)
3149     {
3150         /* The update groups cog's are invalid after sorting
3151          * and need to be cleared before the next partitioning anyhow.
3152          */
3153         comm->updateGroupsCog->clear();
3154     }
3155
3156     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3157
3158     /* Setup up the communication and communicate the coordinates */
3159     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3160
3161     /* Set the indices */
3162     make_dd_indices(dd, ncgindex_set);
3163
3164     /* Set the charge group boundaries for neighbor searching */
3165     set_cg_boundaries(&comm->zones);
3166
3167     if (fr->cutoff_scheme == ecutsVERLET)
3168     {
3169         /* When bSortCG=true, we have already set the size for zone 0 */
3170         set_zones_size(dd, state_local->box, &ddbox,
3171                        bSortCG ? 1 : 0, comm->zones.n,
3172                        0);
3173     }
3174
3175     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3176
3177     /*
3178        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3179                  -1,state_local->x.rvec_array(),state_local->box);
3180      */
3181
3182     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3183
3184     /* Extract a local topology from the global topology */
3185     for (int i = 0; i < dd->ndim; i++)
3186     {
3187         np[dd->dim[i]] = comm->cd[i].numPulses();
3188     }
3189     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
3190                       comm->cellsize_min, np,
3191                       fr,
3192                       state_local->x.rvec_array(),
3193                       top_global, top_local);
3194
3195     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3196
3197     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3198
3199     /* Set up the special atom communication */
3200     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3201     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3202     {
3203         auto range = static_cast<DDAtomRanges::Type>(i);
3204         switch (range)
3205         {
3206             case DDAtomRanges::Type::Vsites:
3207                 if (vsite && vsite->numInterUpdategroupVsites)
3208                 {
3209                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3210                 }
3211                 break;
3212             case DDAtomRanges::Type::Constraints:
3213                 if (dd->splitConstraints || dd->splitSettles)
3214                 {
3215                     /* Only for inter-cg constraints we need special code */
3216                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(),
3217                                                   constr, ir->nProjOrder,
3218                                                   top_local->idef.il);
3219                 }
3220                 break;
3221             default:
3222                 gmx_incons("Unknown special atom type setup");
3223         }
3224         comm->atomRanges.setEnd(range, n);
3225     }
3226
3227     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3228
3229     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3230
3231     /* Make space for the extra coordinates for virtual site
3232      * or constraint communication.
3233      */
3234     state_local->natoms = comm->atomRanges.numAtomsTotal();
3235
3236     dd_resize_state(state_local, f, state_local->natoms);
3237
3238     if (fr->haveDirectVirialContributions)
3239     {
3240         if (vsite && vsite->numInterUpdategroupVsites)
3241         {
3242             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3243         }
3244         else
3245         {
3246             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3247             {
3248                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3249             }
3250             else
3251             {
3252                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3253             }
3254         }
3255     }
3256     else
3257     {
3258         nat_f_novirsum = 0;
3259     }
3260
3261     /* Set the number of atoms required for the force calculation.
3262      * Forces need to be constrained when doing energy
3263      * minimization. For simple simulations we could avoid some
3264      * allocation, zeroing and copying, but this is probably not worth
3265      * the complications and checking.
3266      */
3267     forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3268                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3269                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3270                         nat_f_novirsum);
3271
3272     /* Update atom data for mdatoms and several algorithms */
3273     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3274                               nullptr, mdAtoms, constr, vsite, nullptr);
3275
3276     auto mdatoms = mdAtoms->mdatoms();
3277     if (!thisRankHasDuty(cr, DUTY_PME))
3278     {
3279         /* Send the charges and/or c6/sigmas to our PME only node */
3280         gmx_pme_send_parameters(cr,
3281                                 fr->ic,
3282                                 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3283                                 mdatoms->chargeA, mdatoms->chargeB,
3284                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3285                                 mdatoms->sigmaA, mdatoms->sigmaB,
3286                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3287     }
3288
3289     if (ir->bPull)
3290     {
3291         /* Update the local pull groups */
3292         dd_make_local_pull_groups(cr, pull_work);
3293     }
3294
3295     if (dd->atomSets != nullptr)
3296     {
3297         /* Update the local atom sets */
3298         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3299     }
3300
3301     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3302     imdSession->dd_make_local_IMD_atoms(dd);
3303
3304     add_dd_statistics(dd);
3305
3306     /* Make sure we only count the cycles for this DD partitioning */
3307     clear_dd_cycle_counts(dd);
3308
3309     /* Because the order of the atoms might have changed since
3310      * the last vsite construction, we need to communicate the constructing
3311      * atom coordinates again (for spreading the forces this MD step).
3312      */
3313     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3314
3315     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3316
3317     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
3318     {
3319         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3320         write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3321                      -1, state_local->x.rvec_array(), state_local->box);
3322     }
3323
3324     /* Store the partitioning step */
3325     comm->partition_step = step;
3326
3327     /* Increase the DD partitioning counter */
3328     dd->ddp_count++;
3329     /* The state currently matches this DD partitioning count, store it */
3330     state_local->ddp_count = dd->ddp_count;
3331     if (bMasterState)
3332     {
3333         /* The DD master node knows the complete cg distribution,
3334          * store the count so we can possibly skip the cg info communication.
3335          */
3336         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3337     }
3338
3339     if (comm->DD_debug > 0)
3340     {
3341         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3342         check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
3343                                 "after partitioning");
3344     }
3345
3346     wallcycle_stop(wcycle, ewcDOMDEC);
3347 }
3348
3349 /*! \brief Check whether bonded interactions are missing, if appropriate */
3350 void checkNumberOfBondedInteractions(const gmx::MDLogger  &mdlog,
3351                                      t_commrec            *cr,
3352                                      int                   totalNumberOfBondedInteractions,
3353                                      const gmx_mtop_t     *top_global,
3354                                      const gmx_localtop_t *top_local,
3355                                      const rvec           *x,
3356                                      const matrix          box,
3357                                      bool                 *shouldCheckNumberOfBondedInteractions)
3358 {
3359     if (*shouldCheckNumberOfBondedInteractions)
3360     {
3361         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3362         {
3363             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, x, box); // Does not return
3364         }
3365         *shouldCheckNumberOfBondedInteractions = false;
3366     }
3367 }