Remove commrec from set_ddbox()
[gromacs.git] / src / gromacs / domdec / domdec.cpp
blob bf55c4114177be17eabe2d0852f3c6b2a81c8bd0
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
48 #include <cmath>
50 #include <algorithm>
52 #include "gromacs/domdec/domdec_network.h"
53 #include "gromacs/domdec/ga2la.h"
54 #include "gromacs/ewald/pme.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gpu_utils/gpu_utils.h"
61 #include "gromacs/hardware/hw_info.h"
62 #include "gromacs/imd/imd.h"
63 #include "gromacs/listed-forces/manage-threading.h"
64 #include "gromacs/math/functions.h"
65 #include "gromacs/math/vec.h"
66 #include "gromacs/math/vectypes.h"
67 #include "gromacs/mdlib/constr.h"
68 #include "gromacs/mdlib/constraintrange.h"
69 #include "gromacs/mdlib/force.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
72 #include "gromacs/mdlib/lincs.h"
73 #include "gromacs/mdlib/mdatoms.h"
74 #include "gromacs/mdlib/mdrun.h"
75 #include "gromacs/mdlib/mdsetup.h"
76 #include "gromacs/mdlib/nb_verlet.h"
77 #include "gromacs/mdlib/nbnxn_grid.h"
78 #include "gromacs/mdlib/nsgrid.h"
79 #include "gromacs/mdlib/vsite.h"
80 #include "gromacs/mdtypes/commrec.h"
81 #include "gromacs/mdtypes/df_history.h"
82 #include "gromacs/mdtypes/forcerec.h"
83 #include "gromacs/mdtypes/inputrec.h"
84 #include "gromacs/mdtypes/md_enums.h"
85 #include "gromacs/mdtypes/mdatom.h"
86 #include "gromacs/mdtypes/nblist.h"
87 #include "gromacs/mdtypes/state.h"
88 #include "gromacs/pbcutil/ishift.h"
89 #include "gromacs/pbcutil/pbc.h"
90 #include "gromacs/pulling/pull.h"
91 #include "gromacs/pulling/pull_rotation.h"
92 #include "gromacs/swap/swapcoords.h"
93 #include "gromacs/timing/wallcycle.h"
94 #include "gromacs/topology/block.h"
95 #include "gromacs/topology/idef.h"
96 #include "gromacs/topology/ifunc.h"
97 #include "gromacs/topology/mtop_lookup.h"
98 #include "gromacs/topology/mtop_util.h"
99 #include "gromacs/topology/topology.h"
100 #include "gromacs/utility/basedefinitions.h"
101 #include "gromacs/utility/basenetwork.h"
102 #include "gromacs/utility/cstringutil.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/qsort_threadsafe.h"
107 #include "gromacs/utility/real.h"
108 #include "gromacs/utility/smalloc.h"
109 #include "gromacs/utility/stringutil.h"
111 #include "domdec_constraints.h"
112 #include "domdec_internal.h"
113 #include "domdec_vsite.h"
115 #define DDRANK(dd, rank) (rank)
116 #define DDMASTERRANK(dd) (dd->masterrank)
118 struct gmx_domdec_master_t
120 /* The cell boundaries */
121 real **cell_x;
122 /* The global charge group division */
123 int *ncg; /* Number of home charge groups for each node */
124 int *index; /* Index of nnodes+1 into cg */
125 int *cg; /* Global charge group index */
126 int *nat; /* Number of home atoms for each node. */
127 int *ibuf; /* Buffer for communication */
128 rvec *vbuf; /* Buffer for state scattering and gathering */
131 #define DD_NLOAD_MAX 9
133 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
135 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
136 #define DD_CGIBS 2
138 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
139 #define DD_FLAG_NRCG 65535
140 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
141 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
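/* As a concrete reading of the flag layout above (a sketch, not from the
 * original comments): the low 16 bits of a cggl_flag word, masked with
 * DD_FLAG_NRCG, presumably hold the charge-group atom count, while bit
 * 16+2*d marks a forward move and bit 16+2*d+1 a backward move along
 * decomposition dimension d, e.g. bits 16/17 for dim 0 and 18/19 for dim 1.
 */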
143 /* The DD zone order */
144 static const ivec dd_zo[DD_MAXZONE] =
145 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
147 /* The non-bonded zone-pair setup for domain decomposition
148 * The first number is the i-zone, the second number the first j-zone seen by
149 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
150 * As is, this is for 3D decomposition, where there are 4 i-zones.
151 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
152 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
154 static const int
155 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
156 {1, 3, 6},
157 {2, 5, 6},
158 {3, 5, 7}};
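/* Read as an example: the row {1, 3, 6} states that i-zone 1 sees j-zones
 * 3, 4 and 5, while {0, 0, 8} states that i-zone 0 sees all eight zones.
 */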
160 /* Factors used to avoid problems due to rounding issues */
161 #define DD_CELL_MARGIN 1.0001
162 #define DD_CELL_MARGIN2 1.00005
163 /* Factor to account for pressure scaling during nstlist steps */
164 #define DD_PRES_SCALE_MARGIN 1.02
166 /* Turn on DLB when the load imbalance causes this amount of total loss.
167 * There is a bit of overhead with DLB and it's difficult to achieve
168 * a load imbalance of less than 2% with DLB.
170 #define DD_PERF_LOSS_DLB_ON 0.02
172 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
173 #define DD_PERF_LOSS_WARN 0.05
175 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
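/* Size sketch: DD_CELL_F_SIZE(dd, di) expands to nc[dim[di]] + 2 + 3*di
 * entries, e.g. 6 reals for di = 0 with 4 domains along that dimension and
 * 9 reals for di = 1; the entries beyond the nc+1 cell boundaries are
 * presumably used for the extremes exchanged during DLB.
 */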
177 /* Use separate MPI send and receive commands
178 * when nnodes <= GMX_DD_NNODES_SENDRECV.
179 * This saves memory (and some copying for small nnodes).
180 * For high parallelization scatter and gather calls are used.
182 #define GMX_DD_NNODES_SENDRECV 4
185 /* We check whether to turn on DLB at the first and then at every 100th DD partitioning.
186 * With large imbalance DLB will turn on at the first step, so we can
187 * make the interval so large that the MPI overhead of the check is negligible.
189 static const int c_checkTurnDlbOnInterval = 100;
190 /* We need to check if DLB results in worse performance and then turn it off.
191 * We check this more often than for turning DLB on, because the DLB can scale
192 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
193 * and furthermore, we are already synchronizing often with DLB, so
194 * the overhead of the MPI Bcast is not that high.
196 static const int c_checkTurnDlbOffInterval = 20;
198 /* Forward declaration */
199 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
/*
203 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
205 static void index2xyz(ivec nc,int ind,ivec xyz)
207 xyz[XX] = ind % nc[XX];
208 xyz[YY] = (ind / nc[XX]) % nc[YY];
209 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
*/
213 /* This order is required to minimize the coordinate communication in PME
214 * which uses decomposition in the x direction.
216 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
218 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
220 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
221 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
222 xyz[ZZ] = ind % nc[ZZ];
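/* Worked example of the mapping above: with nc = {2, 2, 2} the rank index
 * runs z fastest, so dd_index(nc, {1, 0, 1}) = ((1*2 + 0)*2) + 1 = 5 and
 * ddindex2xyz(nc, 5, xyz) recovers xyz = {1, 0, 1}.
 */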
225 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
227 int ddindex;
228 int ddnodeid = -1;
230 ddindex = dd_index(dd->nc, c);
231 if (dd->comm->bCartesianPP_PME)
233 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
235 else if (dd->comm->bCartesianPP)
237 #if GMX_MPI
238 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
239 #endif
241 else
243 ddnodeid = ddindex;
246 return ddnodeid;
249 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
251 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
254 int ddglatnr(const gmx_domdec_t *dd, int i)
256 int atnr;
258 if (dd == nullptr)
260 atnr = i + 1;
262 else
264 if (i >= dd->comm->nat[ddnatNR-1])
266 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
268 atnr = dd->gatindex[i] + 1;
271 return atnr;
274 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
276 return &dd->comm->cgs_gl;
279 /*! \brief Returns true if the DLB state indicates that the balancer is on. */
280 static bool isDlbOn(const gmx_domdec_comm_t *comm)
282 return (comm->dlbState == edlbsOnCanTurnOff ||
283 comm->dlbState == edlbsOnUser);
285 /*! \brief Returns true if the DLB state indicates that the balancer is off/disabled.
287 static bool isDlbDisabled(const gmx_domdec_comm_t *comm)
289 return (comm->dlbState == edlbsOffUser ||
290 comm->dlbState == edlbsOffForever);
293 static void vec_rvec_init(vec_rvec_t *v)
295 v->nalloc = 0;
296 v->v = nullptr;
299 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
301 if (n > v->nalloc)
303 v->nalloc = over_alloc_dd(n);
304 srenew(v->v, v->nalloc);
308 void dd_store_state(gmx_domdec_t *dd, t_state *state)
310 int i;
312 if (state->ddp_count != dd->ddp_count)
314 gmx_incons("The MD state does not match the domain decomposition state");
317 state->cg_gl.resize(dd->ncg_home);
318 for (i = 0; i < dd->ncg_home; i++)
320 state->cg_gl[i] = dd->index_gl[i];
323 state->ddp_count_cg_gl = dd->ddp_count;
326 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
328 return &dd->comm->zones;
331 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
332 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
334 gmx_domdec_zones_t *zones;
335 int izone, d, dim;
337 zones = &dd->comm->zones;
339 izone = 0;
340 while (icg >= zones->izone[izone].cg1)
342 izone++;
345 if (izone == 0)
347 *jcg0 = icg;
349 else if (izone < zones->nizone)
351 *jcg0 = zones->izone[izone].jcg0;
353 else
355 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
356 icg, izone, zones->nizone);
359 *jcg1 = zones->izone[izone].jcg1;
361 for (d = 0; d < dd->ndim; d++)
363 dim = dd->dim[d];
364 shift0[dim] = zones->izone[izone].shift0[dim];
365 shift1[dim] = zones->izone[izone].shift1[dim];
366 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
368 /* A conservative approach, this can be optimized */
369 shift0[dim] -= 1;
370 shift1[dim] += 1;
375 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
377 /* We currently set mdatoms entries for all atoms:
378 * local + non-local + communicated for vsite + constraints
381 return dd->comm->nat[ddnatNR - 1];
384 int dd_natoms_vsite(const gmx_domdec_t *dd)
386 return dd->comm->nat[ddnatVSITE];
389 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
391 *at_start = dd->comm->nat[ddnatCON-1];
392 *at_end = dd->comm->nat[ddnatCON];
395 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[], gmx_wallcycle *wcycle)
397 wallcycle_start(wcycle, ewcMOVEX);
399 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
400 int *index, *cgindex;
401 gmx_domdec_comm_t *comm;
402 gmx_domdec_comm_dim_t *cd;
403 gmx_domdec_ind_t *ind;
404 rvec shift = {0, 0, 0}, *buf, *rbuf;
405 gmx_bool bPBC, bScrew;
407 comm = dd->comm;
409 cgindex = dd->cgindex;
411 buf = comm->vbuf.v;
413 nzone = 1;
414 nat_tot = dd->nat_home;
415 for (d = 0; d < dd->ndim; d++)
417 bPBC = (dd->ci[dd->dim[d]] == 0);
418 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
419 if (bPBC)
421 copy_rvec(box[dd->dim[d]], shift);
423 cd = &comm->cd[d];
424 for (p = 0; p < cd->np; p++)
426 ind = &cd->ind[p];
427 index = ind->index;
428 n = 0;
429 if (!bPBC)
431 for (i = 0; i < ind->nsend[nzone]; i++)
433 at0 = cgindex[index[i]];
434 at1 = cgindex[index[i]+1];
435 for (j = at0; j < at1; j++)
437 copy_rvec(x[j], buf[n]);
438 n++;
442 else if (!bScrew)
444 for (i = 0; i < ind->nsend[nzone]; i++)
446 at0 = cgindex[index[i]];
447 at1 = cgindex[index[i]+1];
448 for (j = at0; j < at1; j++)
450 /* We need to shift the coordinates */
451 rvec_add(x[j], shift, buf[n]);
452 n++;
456 else
458 for (i = 0; i < ind->nsend[nzone]; i++)
460 at0 = cgindex[index[i]];
461 at1 = cgindex[index[i]+1];
462 for (j = at0; j < at1; j++)
464 /* Shift x */
465 buf[n][XX] = x[j][XX] + shift[XX];
466 /* Rotate y and z.
467 * This operation requires a special shift force
468 * treatment, which is performed in calc_vir.
470 buf[n][YY] = box[YY][YY] - x[j][YY];
471 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
472 n++;
477 if (cd->bInPlace)
479 rbuf = x + nat_tot;
481 else
483 rbuf = comm->vbuf2.v;
485 /* Send and receive the coordinates */
486 dd_sendrecv_rvec(dd, d, dddirBackward,
487 buf, ind->nsend[nzone+1],
488 rbuf, ind->nrecv[nzone+1]);
489 if (!cd->bInPlace)
491 j = 0;
492 for (zone = 0; zone < nzone; zone++)
494 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
496 copy_rvec(rbuf[j], x[i]);
497 j++;
501 nat_tot += ind->nrecv[nzone+1];
503 nzone += nzone;
506 wallcycle_stop(wcycle, ewcMOVEX);
509 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift, gmx_wallcycle *wcycle)
511 wallcycle_start(wcycle, ewcMOVEF);
513 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
514 int *index, *cgindex;
515 gmx_domdec_comm_t *comm;
516 gmx_domdec_comm_dim_t *cd;
517 gmx_domdec_ind_t *ind;
518 rvec *buf, *sbuf;
519 ivec vis;
520 int is;
521 gmx_bool bShiftForcesNeedPbc, bScrew;
523 comm = dd->comm;
525 cgindex = dd->cgindex;
527 buf = comm->vbuf.v;
529 nzone = comm->zones.n/2;
530 nat_tot = dd->nat_tot;
531 for (d = dd->ndim-1; d >= 0; d--)
533 /* Only forces in domains near the PBC boundaries need to
534 consider PBC in the treatment of fshift */
535 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
536 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
537 if (fshift == nullptr && !bScrew)
539 bShiftForcesNeedPbc = FALSE;
541 /* Determine which shift vector we need */
542 clear_ivec(vis);
543 vis[dd->dim[d]] = 1;
544 is = IVEC2IS(vis);
546 cd = &comm->cd[d];
547 for (p = cd->np-1; p >= 0; p--)
549 ind = &cd->ind[p];
550 nat_tot -= ind->nrecv[nzone+1];
551 if (cd->bInPlace)
553 sbuf = f + nat_tot;
555 else
557 sbuf = comm->vbuf2.v;
558 j = 0;
559 for (zone = 0; zone < nzone; zone++)
561 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
563 copy_rvec(f[i], sbuf[j]);
564 j++;
568 /* Communicate the forces */
569 dd_sendrecv_rvec(dd, d, dddirForward,
570 sbuf, ind->nrecv[nzone+1],
571 buf, ind->nsend[nzone+1]);
572 index = ind->index;
573 /* Add the received forces */
574 n = 0;
575 if (!bShiftForcesNeedPbc)
577 for (i = 0; i < ind->nsend[nzone]; i++)
579 at0 = cgindex[index[i]];
580 at1 = cgindex[index[i]+1];
581 for (j = at0; j < at1; j++)
583 rvec_inc(f[j], buf[n]);
584 n++;
588 else if (!bScrew)
590 /* fshift should always be defined if this function is
591 * called when bShiftForcesNeedPbc is true */
592 assert(NULL != fshift);
593 for (i = 0; i < ind->nsend[nzone]; i++)
595 at0 = cgindex[index[i]];
596 at1 = cgindex[index[i]+1];
597 for (j = at0; j < at1; j++)
599 rvec_inc(f[j], buf[n]);
600 /* Add this force to the shift force */
601 rvec_inc(fshift[is], buf[n]);
602 n++;
606 else
608 for (i = 0; i < ind->nsend[nzone]; i++)
610 at0 = cgindex[index[i]];
611 at1 = cgindex[index[i]+1];
612 for (j = at0; j < at1; j++)
614 /* Rotate the force */
615 f[j][XX] += buf[n][XX];
616 f[j][YY] -= buf[n][YY];
617 f[j][ZZ] -= buf[n][ZZ];
618 if (fshift)
620 /* Add this force to the shift force */
621 rvec_inc(fshift[is], buf[n]);
623 n++;
628 nzone /= 2;
630 wallcycle_stop(wcycle, ewcMOVEF);
633 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
635 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
636 int *index, *cgindex;
637 gmx_domdec_comm_t *comm;
638 gmx_domdec_comm_dim_t *cd;
639 gmx_domdec_ind_t *ind;
640 real *buf, *rbuf;
642 comm = dd->comm;
644 cgindex = dd->cgindex;
646 buf = &comm->vbuf.v[0][0];
648 nzone = 1;
649 nat_tot = dd->nat_home;
650 for (d = 0; d < dd->ndim; d++)
652 cd = &comm->cd[d];
653 for (p = 0; p < cd->np; p++)
655 ind = &cd->ind[p];
656 index = ind->index;
657 n = 0;
658 for (i = 0; i < ind->nsend[nzone]; i++)
660 at0 = cgindex[index[i]];
661 at1 = cgindex[index[i]+1];
662 for (j = at0; j < at1; j++)
664 buf[n] = v[j];
665 n++;
669 if (cd->bInPlace)
671 rbuf = v + nat_tot;
673 else
675 rbuf = &comm->vbuf2.v[0][0];
677 /* Send and receive the coordinates */
678 dd_sendrecv_real(dd, d, dddirBackward,
679 buf, ind->nsend[nzone+1],
680 rbuf, ind->nrecv[nzone+1]);
681 if (!cd->bInPlace)
683 j = 0;
684 for (zone = 0; zone < nzone; zone++)
686 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
688 v[i] = rbuf[j];
689 j++;
693 nat_tot += ind->nrecv[nzone+1];
695 nzone += nzone;
699 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
701 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
702 int *index, *cgindex;
703 gmx_domdec_comm_t *comm;
704 gmx_domdec_comm_dim_t *cd;
705 gmx_domdec_ind_t *ind;
706 real *buf, *sbuf;
708 comm = dd->comm;
710 cgindex = dd->cgindex;
712 buf = &comm->vbuf.v[0][0];
714 nzone = comm->zones.n/2;
715 nat_tot = dd->nat_tot;
716 for (d = dd->ndim-1; d >= 0; d--)
718 cd = &comm->cd[d];
719 for (p = cd->np-1; p >= 0; p--)
721 ind = &cd->ind[p];
722 nat_tot -= ind->nrecv[nzone+1];
723 if (cd->bInPlace)
725 sbuf = v + nat_tot;
727 else
729 sbuf = &comm->vbuf2.v[0][0];
730 j = 0;
731 for (zone = 0; zone < nzone; zone++)
733 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
735 sbuf[j] = v[i];
736 j++;
740 /* Communicate the forces */
741 dd_sendrecv_real(dd, d, dddirForward,
742 sbuf, ind->nrecv[nzone+1],
743 buf, ind->nsend[nzone+1]);
744 index = ind->index;
745 /* Add the received forces */
746 n = 0;
747 for (i = 0; i < ind->nsend[nzone]; i++)
749 at0 = cgindex[index[i]];
750 at1 = cgindex[index[i]+1];
751 for (j = at0; j < at1; j++)
753 v[j] += buf[n];
754 n++;
758 nzone /= 2;
762 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
764 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
765 d, i, j,
766 zone->min0, zone->max1,
767 zone->mch0, zone->mch1,
768 zone->p1_0, zone->p1_1);
772 #define DDZONECOMM_MAXZONE 5
773 #define DDZONECOMM_BUFSIZE 3
775 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
776 int ddimind, int direction,
777 gmx_ddzone_t *buf_s, int n_s,
778 gmx_ddzone_t *buf_r, int n_r)
780 #define ZBS DDZONECOMM_BUFSIZE
781 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
782 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
783 int i;
785 for (i = 0; i < n_s; i++)
787 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
788 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
789 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
790 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
791 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
792 vbuf_s[i*ZBS+1][2] = 0;
793 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
794 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
795 vbuf_s[i*ZBS+2][2] = 0;
798 dd_sendrecv_rvec(dd, ddimind, direction,
799 vbuf_s, n_s*ZBS,
800 vbuf_r, n_r*ZBS);
802 for (i = 0; i < n_r; i++)
804 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
805 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
806 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
807 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
808 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
809 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
810 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
813 #undef ZBS
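/* Packing sketch for dd_sendrecv_ddzone: each gmx_ddzone_t is serialized
 * into DDZONECOMM_BUFSIZE = 3 rvecs, i.e. 9 reals of which 7 carry data
 * ({min0, max1, min1}, {mch0, mch1, 0}, {p1_0, p1_1, 0}), so a message of
 * n_s zones occupies n_s*3 rvecs in vbuf_s/vbuf_r.
 */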
816 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
817 rvec cell_ns_x0, rvec cell_ns_x1)
819 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
820 gmx_ddzone_t *zp;
821 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
822 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
823 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
824 rvec extr_s[2], extr_r[2];
825 rvec dh;
826 real dist_d, c = 0, det;
827 gmx_domdec_comm_t *comm;
828 gmx_bool bPBC, bUse;
830 comm = dd->comm;
832 for (d = 1; d < dd->ndim; d++)
834 dim = dd->dim[d];
835 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
836 zp->min0 = cell_ns_x0[dim];
837 zp->max1 = cell_ns_x1[dim];
838 zp->min1 = cell_ns_x1[dim];
839 zp->mch0 = cell_ns_x0[dim];
840 zp->mch1 = cell_ns_x1[dim];
841 zp->p1_0 = cell_ns_x0[dim];
842 zp->p1_1 = cell_ns_x1[dim];
845 for (d = dd->ndim-2; d >= 0; d--)
847 dim = dd->dim[d];
848 bPBC = (dim < ddbox->npbcdim);
850 /* Use an rvec to store two reals */
851 extr_s[d][0] = comm->cell_f0[d+1];
852 extr_s[d][1] = comm->cell_f1[d+1];
853 extr_s[d][2] = comm->cell_f1[d+1];
855 pos = 0;
856 /* Store the extremes in the backward sending buffer,
857 * so they get updated separately from the forward communication.
859 for (d1 = d; d1 < dd->ndim-1; d1++)
861 /* We invert the order to be able to use the same loop for buf_e */
862 buf_s[pos].min0 = extr_s[d1][1];
863 buf_s[pos].max1 = extr_s[d1][0];
864 buf_s[pos].min1 = extr_s[d1][2];
865 buf_s[pos].mch0 = 0;
866 buf_s[pos].mch1 = 0;
867 /* Store the cell corner of the dimension we communicate along */
868 buf_s[pos].p1_0 = comm->cell_x0[dim];
869 buf_s[pos].p1_1 = 0;
870 pos++;
873 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
874 pos++;
876 if (dd->ndim == 3 && d == 0)
878 buf_s[pos] = comm->zone_d2[0][1];
879 pos++;
880 buf_s[pos] = comm->zone_d1[0];
881 pos++;
884 /* We only need to communicate the extremes
885 * in the forward direction
887 npulse = comm->cd[d].np;
888 if (bPBC)
890 /* Take the minimum to avoid double communication */
891 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
893 else
895 /* Without PBC we should really not communicate over
896 * the boundaries, but implementing that complicates
897 * the communication setup and therefore we simply
898 * do all communication, but ignore some data.
900 npulse_min = npulse;
902 for (p = 0; p < npulse_min; p++)
904 /* Communicate the extremes forward */
905 bUse = (bPBC || dd->ci[dim] > 0);
907 dd_sendrecv_rvec(dd, d, dddirForward,
908 extr_s+d, dd->ndim-d-1,
909 extr_r+d, dd->ndim-d-1);
911 if (bUse)
913 for (d1 = d; d1 < dd->ndim-1; d1++)
915 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
916 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
917 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
922 buf_size = pos;
923 for (p = 0; p < npulse; p++)
925 /* Communicate all the zone information backward */
926 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
928 dd_sendrecv_ddzone(dd, d, dddirBackward,
929 buf_s, buf_size,
930 buf_r, buf_size);
932 clear_rvec(dh);
933 if (p > 0)
935 for (d1 = d+1; d1 < dd->ndim; d1++)
937 /* Determine the decrease of maximum required
938 * communication height along d1 due to the distance along d,
939 * this avoids a lot of useless atom communication.
941 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
943 if (ddbox->tric_dir[dim])
945 /* c is the off-diagonal coupling between the cell planes
946 * along directions d and d1.
948 c = ddbox->v[dim][dd->dim[d1]][dim];
950 else
952 c = 0;
954 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
955 if (det > 0)
957 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
959 else
961 /* A negative value signals out of range */
962 dh[d1] = -1;
967 /* Accumulate the extremes over all pulses */
968 for (i = 0; i < buf_size; i++)
970 if (p == 0)
972 buf_e[i] = buf_r[i];
974 else
976 if (bUse)
978 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
979 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
980 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
983 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
985 d1 = 1;
987 else
989 d1 = d + 1;
991 if (bUse && dh[d1] >= 0)
993 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
994 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
997 /* Copy the received buffer to the send buffer,
998 * to pass the data through with the next pulse.
1000 buf_s[i] = buf_r[i];
1002 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1003 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1005 /* Store the extremes */
1006 pos = 0;
1008 for (d1 = d; d1 < dd->ndim-1; d1++)
1010 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1011 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1012 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1013 pos++;
1016 if (d == 1 || (d == 0 && dd->ndim == 3))
1018 for (i = d; i < 2; i++)
1020 comm->zone_d2[1-d][i] = buf_e[pos];
1021 pos++;
1024 if (d == 0)
1026 comm->zone_d1[1] = buf_e[pos];
1027 pos++;
1033 if (dd->ndim >= 2)
1035 dim = dd->dim[1];
1036 for (i = 0; i < 2; i++)
1038 if (debug)
1040 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1042 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1043 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1046 if (dd->ndim >= 3)
1048 dim = dd->dim[2];
1049 for (i = 0; i < 2; i++)
1051 for (j = 0; j < 2; j++)
1053 if (debug)
1055 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1057 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1058 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1062 for (d = 1; d < dd->ndim; d++)
1064 comm->cell_f_max0[d] = extr_s[d-1][0];
1065 comm->cell_f_min1[d] = extr_s[d-1][1];
1066 if (debug)
1068 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1069 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1074 static void dd_collect_cg(gmx_domdec_t *dd,
1075 const t_state *state_local)
1077 gmx_domdec_master_t *ma = nullptr;
1078 int buf2[2], *ibuf, i, ncg_home = 0, nat_home = 0;
1080 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1082 /* The master has the correct distribution */
1083 return;
1086 const int *cg;
1088 if (state_local->ddp_count == dd->ddp_count)
1090 /* The local state and DD are in sync, use the DD indices */
1091 ncg_home = dd->ncg_home;
1092 cg = dd->index_gl;
1093 nat_home = dd->nat_home;
1095 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1097 /* The DD is out of sync with the local state, but we have stored
1098 * the cg indices with the local state, so we can use those.
1100 t_block *cgs_gl;
1102 cgs_gl = &dd->comm->cgs_gl;
1104 ncg_home = state_local->cg_gl.size();
1105 cg = state_local->cg_gl.data();
1106 nat_home = 0;
1107 for (i = 0; i < ncg_home; i++)
1109 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1112 else
1114 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1117 buf2[0] = ncg_home;
1118 buf2[1] = nat_home;
1119 if (DDMASTER(dd))
1121 ma = dd->ma;
1122 ibuf = ma->ibuf;
1124 else
1126 ibuf = nullptr;
1128 /* Collect the charge group and atom counts on the master */
1129 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1131 if (DDMASTER(dd))
1133 ma->index[0] = 0;
1134 for (i = 0; i < dd->nnodes; i++)
1136 ma->ncg[i] = ma->ibuf[2*i];
1137 ma->nat[i] = ma->ibuf[2*i+1];
1138 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1141 /* Make byte counts and indices */
1142 for (i = 0; i < dd->nnodes; i++)
1144 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1145 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1147 if (debug)
1149 fprintf(debug, "Initial charge group distribution: ");
1150 for (i = 0; i < dd->nnodes; i++)
1152 fprintf(debug, " %d", ma->ncg[i]);
1154 fprintf(debug, "\n");
1158 /* Collect the charge group indices on the master */
1159 dd_gatherv(dd,
1160 ncg_home*sizeof(int), cg,
1161 DDMASTER(dd) ? ma->ibuf : nullptr,
1162 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1163 DDMASTER(dd) ? ma->cg : nullptr);
1165 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1168 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1169 gmx::ArrayRef<const gmx::RVec> lv,
1170 gmx::ArrayRef<gmx::RVec> v)
1172 gmx_domdec_master_t *ma;
1173 int n, i, c, a, nalloc = 0;
1174 rvec *buf = nullptr;
1175 t_block *cgs_gl;
1177 ma = dd->ma;
1179 if (!DDMASTER(dd))
1181 #if GMX_MPI
1182 MPI_Send(const_cast<void *>(static_cast<const void *>(lv.data())), dd->nat_home*sizeof(rvec), MPI_BYTE,
1183 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1184 #endif
1186 else
1188 /* Copy the master coordinates to the global array */
1189 cgs_gl = &dd->comm->cgs_gl;
1191 n = DDMASTERRANK(dd);
1192 a = 0;
1193 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1195 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1197 copy_rvec(lv[a++], v[c]);
1201 for (n = 0; n < dd->nnodes; n++)
1203 if (n != dd->rank)
1205 if (ma->nat[n] > nalloc)
1207 nalloc = over_alloc_dd(ma->nat[n]);
1208 srenew(buf, nalloc);
1210 #if GMX_MPI
1211 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1212 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1213 #endif
1214 a = 0;
1215 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1217 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1219 copy_rvec(buf[a++], v[c]);
1224 sfree(buf);
1228 static void get_commbuffer_counts(gmx_domdec_t *dd,
1229 int **counts, int **disps)
1231 gmx_domdec_master_t *ma;
1232 int n;
1234 ma = dd->ma;
1236 /* Make the rvec count and displacement arrays */
1237 *counts = ma->ibuf;
1238 *disps = ma->ibuf + dd->nnodes;
1239 for (n = 0; n < dd->nnodes; n++)
1241 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1242 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1246 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1247 gmx::ArrayRef<const gmx::RVec> lv,
1248 gmx::ArrayRef<gmx::RVec> v)
1250 gmx_domdec_master_t *ma;
1251 int *rcounts = nullptr, *disps = nullptr;
1252 int n, i, c, a;
1253 rvec *buf = nullptr;
1254 t_block *cgs_gl;
1256 ma = dd->ma;
1258 if (DDMASTER(dd))
1260 get_commbuffer_counts(dd, &rcounts, &disps);
1262 buf = ma->vbuf;
1265 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv.data(), rcounts, disps, buf);
1267 if (DDMASTER(dd))
1269 cgs_gl = &dd->comm->cgs_gl;
1271 a = 0;
1272 for (n = 0; n < dd->nnodes; n++)
1274 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1276 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1278 copy_rvec(buf[a++], v[c]);
1285 void dd_collect_vec(gmx_domdec_t *dd,
1286 const t_state *state_local,
1287 gmx::ArrayRef<const gmx::RVec> lv,
1288 gmx::ArrayRef<gmx::RVec> v)
1290 dd_collect_cg(dd, state_local);
1292 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1294 dd_collect_vec_sendrecv(dd, lv, v);
1296 else
1298 dd_collect_vec_gatherv(dd, lv, v);
1303 void dd_collect_state(gmx_domdec_t *dd,
1304 const t_state *state_local, t_state *state)
1306 int nh = state_local->nhchainlength;
1308 if (DDMASTER(dd))
1310 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
1312 for (int i = 0; i < efptNR; i++)
1314 state->lambda[i] = state_local->lambda[i];
1316 state->fep_state = state_local->fep_state;
1317 state->veta = state_local->veta;
1318 state->vol0 = state_local->vol0;
1319 copy_mat(state_local->box, state->box);
1320 copy_mat(state_local->boxv, state->boxv);
1321 copy_mat(state_local->svir_prev, state->svir_prev);
1322 copy_mat(state_local->fvir_prev, state->fvir_prev);
1323 copy_mat(state_local->pres_prev, state->pres_prev);
1325 for (int i = 0; i < state_local->ngtc; i++)
1327 for (int j = 0; j < nh; j++)
1329 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1330 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1332 state->therm_integral[i] = state_local->therm_integral[i];
1334 for (int i = 0; i < state_local->nnhpres; i++)
1336 for (int j = 0; j < nh; j++)
1338 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1339 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1342 state->baros_integral = state_local->baros_integral;
1344 if (state_local->flags & (1 << estX))
1346 gmx::ArrayRef<gmx::RVec> globalXRef = state ? gmx::makeArrayRef(state->x) : gmx::EmptyArrayRef();
1347 dd_collect_vec(dd, state_local, state_local->x, globalXRef);
1349 if (state_local->flags & (1 << estV))
1351 gmx::ArrayRef<gmx::RVec> globalVRef = state ? gmx::makeArrayRef(state->v) : gmx::EmptyArrayRef();
1352 dd_collect_vec(dd, state_local, state_local->v, globalVRef);
1354 if (state_local->flags & (1 << estCGP))
1356 gmx::ArrayRef<gmx::RVec> globalCgpRef = state ? gmx::makeArrayRef(state->cg_p) : gmx::EmptyArrayRef();
1357 dd_collect_vec(dd, state_local, state_local->cg_p, globalCgpRef);
1361 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1363 if (debug)
1365 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1368 state_change_natoms(state, natoms);
1370 if (f != nullptr)
1372 /* We need to allocate one element extra, since we might use
1373 * (unaligned) 4-wide SIMD loads to access rvec entries.
1375 f->resize(paddedRVecVectorSize(natoms));
1379 static void dd_check_alloc_ncg(t_forcerec *fr,
1380 t_state *state,
1381 PaddedRVecVector *f,
1382 int numChargeGroups)
1384 if (numChargeGroups > fr->cg_nalloc)
1386 if (debug)
1388 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1390 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1391 srenew(fr->cginfo, fr->cg_nalloc);
1392 if (fr->cutoff_scheme == ecutsGROUP)
1394 srenew(fr->cg_cm, fr->cg_nalloc);
1397 if (fr->cutoff_scheme == ecutsVERLET)
1399 /* We don't use charge groups, we use x in state to set up
1400 * the atom communication.
1402 dd_resize_state(state, f, numChargeGroups);
1406 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1407 const rvec *v, rvec *lv)
1409 gmx_domdec_master_t *ma;
1410 int n, i, c, a, nalloc = 0;
1411 rvec *buf = nullptr;
1413 if (DDMASTER(dd))
1415 ma = dd->ma;
1417 for (n = 0; n < dd->nnodes; n++)
1419 if (n != dd->rank)
1421 if (ma->nat[n] > nalloc)
1423 nalloc = over_alloc_dd(ma->nat[n]);
1424 srenew(buf, nalloc);
1426 /* Use lv as a temporary buffer */
1427 a = 0;
1428 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1430 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1432 copy_rvec(v[c], buf[a++]);
1435 if (a != ma->nat[n])
1437 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1438 a, ma->nat[n]);
1441 #if GMX_MPI
1442 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1443 DDRANK(dd, n), n, dd->mpi_comm_all);
1444 #endif
1447 sfree(buf);
1448 n = DDMASTERRANK(dd);
1449 a = 0;
1450 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1452 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1454 copy_rvec(v[c], lv[a++]);
1458 else
1460 #if GMX_MPI
1461 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1462 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1463 #endif
1467 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1468 const rvec *v, rvec *lv)
1470 gmx_domdec_master_t *ma;
1471 int *scounts = nullptr, *disps = nullptr;
1472 int n, i, c, a;
1473 rvec *buf = nullptr;
1475 if (DDMASTER(dd))
1477 ma = dd->ma;
1479 get_commbuffer_counts(dd, &scounts, &disps);
1481 buf = ma->vbuf;
1482 a = 0;
1483 for (n = 0; n < dd->nnodes; n++)
1485 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1487 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1489 copy_rvec(v[c], buf[a++]);
1495 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1498 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs,
1499 const rvec *v, rvec *lv)
1501 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1503 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1505 else
1507 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1511 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1513 if (dfhist == nullptr)
1515 return;
1518 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1519 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1520 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1522 if (dfhist->nlambda > 0)
1524 int nlam = dfhist->nlambda;
1525 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1526 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1527 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1528 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1529 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1530 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1532 for (int i = 0; i < nlam; i++)
1534 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1535 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1536 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1537 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1538 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1539 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1544 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1545 t_state *state, t_state *state_local,
1546 PaddedRVecVector *f)
1548 int nh = state_local->nhchainlength;
1550 if (DDMASTER(dd))
1552 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
1554 for (int i = 0; i < efptNR; i++)
1556 state_local->lambda[i] = state->lambda[i];
1558 state_local->fep_state = state->fep_state;
1559 state_local->veta = state->veta;
1560 state_local->vol0 = state->vol0;
1561 copy_mat(state->box, state_local->box);
1562 copy_mat(state->box_rel, state_local->box_rel);
1563 copy_mat(state->boxv, state_local->boxv);
1564 copy_mat(state->svir_prev, state_local->svir_prev);
1565 copy_mat(state->fvir_prev, state_local->fvir_prev);
1566 if (state->dfhist != nullptr)
1568 copy_df_history(state_local->dfhist, state->dfhist);
1570 for (int i = 0; i < state_local->ngtc; i++)
1572 for (int j = 0; j < nh; j++)
1574 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1575 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1577 state_local->therm_integral[i] = state->therm_integral[i];
1579 for (int i = 0; i < state_local->nnhpres; i++)
1581 for (int j = 0; j < nh; j++)
1583 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1584 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1587 state_local->baros_integral = state->baros_integral;
1589 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1590 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1591 dd_bcast(dd, sizeof(real), &state_local->veta);
1592 dd_bcast(dd, sizeof(real), &state_local->vol0);
1593 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1594 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1595 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1596 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1597 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1598 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1599 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1600 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1601 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1602 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1604 /* communicate df_history -- required for restarting from checkpoint */
1605 dd_distribute_dfhist(dd, state_local->dfhist);
1607 dd_resize_state(state_local, f, dd->nat_home);
1609 if (state_local->flags & (1 << estX))
1611 const rvec *xGlobal = (DDMASTER(dd) ? as_rvec_array(state->x.data()) : nullptr);
1612 dd_distribute_vec(dd, cgs, xGlobal, as_rvec_array(state_local->x.data()));
1614 if (state_local->flags & (1 << estV))
1616 const rvec *vGlobal = (DDMASTER(dd) ? as_rvec_array(state->v.data()) : nullptr);
1617 dd_distribute_vec(dd, cgs, vGlobal, as_rvec_array(state_local->v.data()));
1619 if (state_local->flags & (1 << estCGP))
1621 const rvec *cgpGlobal = (DDMASTER(dd) ? as_rvec_array(state->cg_p.data()) : nullptr);
1622 dd_distribute_vec(dd, cgs, cgpGlobal, as_rvec_array(state_local->cg_p.data()));
1626 static char dim2char(int dim)
1628 char c = '?';
1630 switch (dim)
1632 case XX: c = 'X'; break;
1633 case YY: c = 'Y'; break;
1634 case ZZ: c = 'Z'; break;
1635 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1638 return c;
1641 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1642 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1644 rvec grid_s[2], *grid_r = nullptr, cx, r;
1645 char fname[STRLEN], buf[22];
1646 FILE *out;
1647 int a, i, d, z, y, x;
1648 matrix tric;
1649 real vol;
1651 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1652 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1654 if (DDMASTER(dd))
1656 snew(grid_r, 2*dd->nnodes);
1659 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
1661 if (DDMASTER(dd))
1663 for (d = 0; d < DIM; d++)
1665 for (i = 0; i < DIM; i++)
1667 if (d == i)
1669 tric[d][i] = 1;
1671 else
1673 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1675 tric[d][i] = box[i][d]/box[i][i];
1677 else
1679 tric[d][i] = 0;
1684 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1685 out = gmx_fio_fopen(fname, "w");
1686 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1687 a = 1;
1688 for (i = 0; i < dd->nnodes; i++)
1690 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1691 for (d = 0; d < DIM; d++)
1693 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1695 for (z = 0; z < 2; z++)
1697 for (y = 0; y < 2; y++)
1699 for (x = 0; x < 2; x++)
1701 cx[XX] = grid_r[i*2+x][XX];
1702 cx[YY] = grid_r[i*2+y][YY];
1703 cx[ZZ] = grid_r[i*2+z][ZZ];
1704 mvmul(tric, cx, r);
1705 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1706 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1710 for (d = 0; d < DIM; d++)
1712 for (x = 0; x < 4; x++)
1714 switch (d)
1716 case 0: y = 1 + i*8 + 2*x; break;
1717 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1718 case 2: y = 1 + i*8 + x; break;
1720 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1724 gmx_fio_fclose(out);
1725 sfree(grid_r);
1729 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1730 const gmx_mtop_t *mtop, const t_commrec *cr,
1731 int natoms, const rvec x[], const matrix box)
1733 char fname[STRLEN], buf[22];
1734 FILE *out;
1735 int i, ii, resnr, c;
1736 const char *atomname, *resname;
1737 real b;
1738 gmx_domdec_t *dd;
1740 dd = cr->dd;
1741 if (natoms == -1)
1743 natoms = dd->comm->nat[ddnatVSITE];
1746 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1748 out = gmx_fio_fopen(fname, "w");
1750 fprintf(out, "TITLE %s\n", title);
1751 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1752 int molb = 0;
1753 for (i = 0; i < natoms; i++)
1755 ii = dd->gatindex[i];
1756 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1757 if (i < dd->comm->nat[ddnatZONE])
1759 c = 0;
1760 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1762 c++;
1764 b = c;
1766 else if (i < dd->comm->nat[ddnatVSITE])
1768 b = dd->comm->zones.n;
1770 else
1772 b = dd->comm->zones.n + 1;
1774 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1775 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1777 fprintf(out, "TER\n");
1779 gmx_fio_fclose(out);
1782 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1784 gmx_domdec_comm_t *comm;
1785 int di;
1786 real r;
1788 comm = dd->comm;
1790 r = -1;
1791 if (comm->bInterCGBondeds)
1793 if (comm->cutoff_mbody > 0)
1795 r = comm->cutoff_mbody;
1797 else
1799 /* cutoff_mbody=0 means we do not have DLB */
1800 r = comm->cellsize_min[dd->dim[0]];
1801 for (di = 1; di < dd->ndim; di++)
1803 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1805 if (comm->bBondComm)
1807 r = std::max(r, comm->cutoff_mbody);
1809 else
1811 r = std::min(r, comm->cutoff);
1816 return r;
1819 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1821 real r_mb;
1823 r_mb = dd_cutoff_multibody(dd);
1825 return std::max(dd->comm->cutoff, r_mb);
1829 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1830 ivec coord_pme)
1832 int nc, ntot;
1834 nc = dd->nc[dd->comm->cartpmedim];
1835 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1836 copy_ivec(coord, coord_pme);
1837 coord_pme[dd->comm->cartpmedim] =
1838 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
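/* Example of the mapping above, assuming nc = 4 PP cells and ntot - nc = 2
 * PME slabs along cartpmedim: coord_pme = 4 + (coord*2 + 1)/4, so PP
 * coordinates 0 and 1 map to PME coordinate 4, and 2 and 3 map to 5.
 */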
1841 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1843 int npp, npme;
1845 npp = dd->nnodes;
1846 npme = dd->comm->npmenodes;
1848 /* Here we assign a PME node to communicate with this DD node
1849 * by assuming that the major index of both is x.
1850 * We add cr->npmenodes/2 to obtain an even distribution.
1852 return (ddindex*npme + npme/2)/npp;
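/* For example, with npp = 8 PP ranks and npme = 4 PME ranks this returns
 * (ddindex*4 + 2)/8 for ddindex = 0..7, i.e. 0, 0, 1, 1, 2, 2, 3, 3: each
 * PME rank serves two consecutive DD indices.
 */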
1855 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1857 int *pme_rank;
1858 int n, i, p0, p1;
1860 snew(pme_rank, dd->comm->npmenodes);
1861 n = 0;
1862 for (i = 0; i < dd->nnodes; i++)
1864 p0 = ddindex2pmeindex(dd, i);
1865 p1 = ddindex2pmeindex(dd, i+1);
1866 if (i+1 == dd->nnodes || p1 > p0)
1868 if (debug)
1870 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1872 pme_rank[n] = i + 1 + n;
1873 n++;
1877 return pme_rank;
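/* Continuing the 8 PP + 4 PME example, the interleaved layout built here
 * places a PME rank after every second PP rank, giving
 * pme_rank = {2, 5, 8, 11} out of 12 simulation ranks in total.
 */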
1880 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
1882 gmx_domdec_t *dd;
1883 ivec coords;
1884 int slab;
1886 dd = cr->dd;
/*
1888 if (dd->comm->bCartesian) {
1889 gmx_ddindex2xyz(dd->nc,ddindex,coords);
1890 dd_coords2pmecoords(dd,coords,coords_pme);
1891 copy_ivec(dd->ntot,nc);
1892 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1893 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1895 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1896 } else {
1897 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
*/
1900 coords[XX] = x;
1901 coords[YY] = y;
1902 coords[ZZ] = z;
1903 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1905 return slab;
1908 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
1910 gmx_domdec_comm_t *comm;
1911 ivec coords;
1912 int ddindex, nodeid = -1;
1914 comm = cr->dd->comm;
1916 coords[XX] = x;
1917 coords[YY] = y;
1918 coords[ZZ] = z;
1919 if (comm->bCartesianPP_PME)
1921 #if GMX_MPI
1922 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1923 #endif
1925 else
1927 ddindex = dd_index(cr->dd->nc, coords);
1928 if (comm->bCartesianPP)
1930 nodeid = comm->ddindex2simnodeid[ddindex];
1932 else
1934 if (comm->pmenodes)
1936 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1938 else
1940 nodeid = ddindex;
1945 return nodeid;
1948 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1949 const t_commrec gmx_unused *cr,
1950 int sim_nodeid)
1952 int pmenode = -1;
1954 const gmx_domdec_comm_t *comm = dd->comm;
1956 /* This assumes a uniform x domain decomposition grid cell size */
1957 if (comm->bCartesianPP_PME)
1959 #if GMX_MPI
1960 ivec coord, coord_pme;
1961 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1962 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1964 /* This is a PP node */
1965 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1966 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1968 #endif
1970 else if (comm->bCartesianPP)
1972 if (sim_nodeid < dd->nnodes)
1974 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1977 else
1979 /* This assumes DD cells with identical x coordinates
1980 * are numbered sequentially.
1982 if (dd->comm->pmenodes == nullptr)
1984 if (sim_nodeid < dd->nnodes)
1986 /* The DD index equals the nodeid */
1987 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1990 else
1992 int i = 0;
1993 while (sim_nodeid > dd->comm->pmenodes[i])
1995 i++;
1997 if (sim_nodeid < dd->comm->pmenodes[i])
1999 pmenode = dd->comm->pmenodes[i];
2004 return pmenode;
2007 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
2009 if (dd != nullptr)
2011 return { dd->comm->npmenodes_x, dd->comm->npmenodes_y };
2013 else
2015 return { 1, 1 };
2019 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
2021 gmx_domdec_t *dd;
2022 int x, y, z;
2023 ivec coord, coord_pme;
2025 dd = cr->dd;
2027 std::vector<int> ddranks;
2028 ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2030 for (x = 0; x < dd->nc[XX]; x++)
2032 for (y = 0; y < dd->nc[YY]; y++)
2034 for (z = 0; z < dd->nc[ZZ]; z++)
2036 if (dd->comm->bCartesianPP_PME)
2038 coord[XX] = x;
2039 coord[YY] = y;
2040 coord[ZZ] = z;
2041 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2042 if (dd->ci[XX] == coord_pme[XX] &&
2043 dd->ci[YY] == coord_pme[YY] &&
2044 dd->ci[ZZ] == coord_pme[ZZ])
2046 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
2049 else
2051 /* The slab corresponds to the nodeid in the PME group */
2052 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2054 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
2060 return ddranks;
2063 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2065 gmx_bool bReceive = TRUE;
2067 if (cr->npmenodes < dd->nnodes)
2069 gmx_domdec_comm_t *comm = dd->comm;
2070 if (comm->bCartesianPP_PME)
2072 #if GMX_MPI
2073 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2074 ivec coords;
2075 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2076 coords[comm->cartpmedim]++;
2077 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2079 int rank;
2080 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2081 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2083 /* This is not the last PP node for pmenode */
2084 bReceive = FALSE;
2087 #else
2088 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2089 #endif
2091 else
2093 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2094 if (cr->sim_nodeid+1 < cr->nnodes &&
2095 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2097 /* This is not the last PP node for pmenode */
2098 bReceive = FALSE;
2103 return bReceive;
2106 static void set_zones_ncg_home(gmx_domdec_t *dd)
2108 gmx_domdec_zones_t *zones;
2109 int i;
2111 zones = &dd->comm->zones;
2113 zones->cg_range[0] = 0;
2114 for (i = 1; i < zones->n+1; i++)
2116 zones->cg_range[i] = dd->ncg_home;
2118 /* zone_ncg1[0] should always be equal to ncg_home */
2119 dd->comm->zone_ncg1[0] = dd->ncg_home;
2122 static void rebuild_cgindex(gmx_domdec_t *dd,
2123 const int *gcgs_index, const t_state *state)
2125 int * gmx_restrict dd_cg_gl = dd->index_gl;
2126 int * gmx_restrict cgindex = dd->cgindex;
2127 int nat = 0;
2129 /* Copy back the global charge group indices from state
2130 * and rebuild the local charge group to atom index.
2132 cgindex[0] = nat;
2133 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2135 cgindex[i] = nat;
2136 int cg_gl = state->cg_gl[i];
2137 dd_cg_gl[i] = cg_gl;
2138 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2140 cgindex[state->cg_gl.size()] = nat;
2142 dd->ncg_home = state->cg_gl.size();
2143 dd->nat_home = nat;
2145 set_zones_ncg_home(dd);
2148 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2150 while (cg >= cginfo_mb->cg_end)
2152 cginfo_mb++;
2155 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
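/* The modulo lookup above reuses cginfo entries with period cg_mod within a
 * molecule block, presumably because all molecules of a block are identical
 * and therefore share the same per-charge-group information.
 */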
2158 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2159 t_forcerec *fr, char *bLocalCG)
2161 cginfo_mb_t *cginfo_mb;
2162 int *cginfo;
2163 int cg;
2165 if (fr != nullptr)
2167 cginfo_mb = fr->cginfo_mb;
2168 cginfo = fr->cginfo;
2170 for (cg = cg0; cg < cg1; cg++)
2172 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2176 if (bLocalCG != nullptr)
2178 for (cg = cg0; cg < cg1; cg++)
2180 bLocalCG[index_gl[cg]] = TRUE;
2185 static void make_dd_indices(gmx_domdec_t *dd,
2186 const int *gcgs_index, int cg_start)
2188 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2189 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2190 gmx_bool bCGs;
2192 if (dd->nat_tot > dd->gatindex_nalloc)
2194 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2195 srenew(dd->gatindex, dd->gatindex_nalloc);
2198 nzone = dd->comm->zones.n;
2199 zone2cg = dd->comm->zones.cg_range;
2200 zone_ncg1 = dd->comm->zone_ncg1;
2201 index_gl = dd->index_gl;
2202 gatindex = dd->gatindex;
2203 bCGs = dd->comm->bCGs;
2205 if (zone2cg[1] != dd->ncg_home)
2207 gmx_incons("dd->ncg_zone is not up to date");
2210 /* Make the local to global and global to local atom index */
2211 a = dd->cgindex[cg_start];
2212 for (zone = 0; zone < nzone; zone++)
2214 if (zone == 0)
2216 cg0 = cg_start;
2218 else
2220 cg0 = zone2cg[zone];
2222 cg1 = zone2cg[zone+1];
2223 cg1_p1 = cg0 + zone_ncg1[zone];
2225 for (cg = cg0; cg < cg1; cg++)
2227 zone1 = zone;
2228 if (cg >= cg1_p1)
2230 /* Signal that this cg is from more than one pulse away */
2231 zone1 += nzone;
2233 cg_gl = index_gl[cg];
2234 if (bCGs)
2236 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2238 gatindex[a] = a_gl;
2239 ga2la_set(dd->ga2la, a_gl, a, zone1);
2240 a++;
2243 else
2245 gatindex[a] = cg_gl;
2246 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2247 a++;
2253 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2254 const char *where)
2256 int i, ngl, nerr;
2258 nerr = 0;
2259 if (bLocalCG == nullptr)
2261 return nerr;
2263 for (i = 0; i < dd->ncg_tot; i++)
2265 if (!bLocalCG[dd->index_gl[i]])
2267 fprintf(stderr,
2268 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2269 nerr++;
2272 ngl = 0;
2273 for (i = 0; i < ncg_sys; i++)
2275 if (bLocalCG[i])
2277 ngl++;
2280 if (ngl != dd->ncg_tot)
2282 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2283 nerr++;
2286 return nerr;
2289 static void check_index_consistency(gmx_domdec_t *dd,
2290 int natoms_sys, int ncg_sys,
2291 const char *where)
2293 int nerr, ngl, i, a, cell;
2294 int *have;
2296 nerr = 0;
2298 if (dd->comm->DD_debug > 1)
2300 snew(have, natoms_sys);
2301 for (a = 0; a < dd->nat_tot; a++)
2303 if (have[dd->gatindex[a]] > 0)
2305 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2307 else
2309 have[dd->gatindex[a]] = a + 1;
2312 sfree(have);
2315 snew(have, dd->nat_tot);
2317 ngl = 0;
2318 for (i = 0; i < natoms_sys; i++)
2320 if (ga2la_get(dd->ga2la, i, &a, &cell))
2322 if (a >= dd->nat_tot)
2324 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2325 nerr++;
2327 else
2329 have[a] = 1;
2330 if (dd->gatindex[a] != i)
2332 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2333 nerr++;
2336 ngl++;
2339 if (ngl != dd->nat_tot)
2341 fprintf(stderr,
2342 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2343 dd->rank, where, ngl, dd->nat_tot);
2345 for (a = 0; a < dd->nat_tot; a++)
2347 if (have[a] == 0)
2349 fprintf(stderr,
2350 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2351 dd->rank, where, a+1, dd->gatindex[a]+1);
2354 sfree(have);
2356 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2358 if (nerr > 0)
2360 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2361 dd->rank, where, nerr);
2365 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2367 int i;
2368 char *bLocalCG;
2370 if (a_start == 0)
2372 /* Clear the whole list without searching */
2373 ga2la_clear(dd->ga2la);
2375 else
2377 for (i = a_start; i < dd->nat_tot; i++)
2379 ga2la_del(dd->ga2la, dd->gatindex[i]);
2383 bLocalCG = dd->comm->bLocalCG;
2384 if (bLocalCG)
2386 for (i = cg_start; i < dd->ncg_tot; i++)
2388 bLocalCG[dd->index_gl[i]] = FALSE;
2392 dd_clear_local_vsite_indices(dd);
2394 if (dd->constraints)
2396 dd_clear_local_constraint_indices(dd);
2400 /* This function should be used for moving the domain boundaries during DLB,
2401 * for obtaining the minimum cell size. It checks the initially set limit
2402 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2403 * and, possibly, a longer cut-off limit set for PME load balancing.
2405 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2407 real cellsize_min;
2409 cellsize_min = comm->cellsize_min[dim];
2411 if (!comm->bVacDLBNoLimit)
2413 /* The cut-off might have changed, e.g. by PME load balancing,
2414 * from the value used to set comm->cellsize_min, so check it.
2416 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2418 if (comm->bPMELoadBalDLBLimits)
2420 /* Check for the cut-off limit set by the PME load balancing */
2421 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2425 return cellsize_min;
2428 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2429 int dim_ind)
2431 real grid_jump_limit;
2433 /* The distance between the boundaries of cells at distance
2434 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2435 * and by the fact that cells should not be shifted by more than
2436 * half their size, such that cg's only shift by one cell
2437 * at redecomposition.
2439 grid_jump_limit = comm->cellsize_limit;
2440 if (!comm->bVacDLBNoLimit)
2442 if (comm->bPMELoadBalDLBLimits)
2444 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2446 grid_jump_limit = std::max(grid_jump_limit,
2447 cutoff/comm->cd[dim_ind].np);
2450 return grid_jump_limit;
2453 static gmx_bool check_grid_jump(gmx_int64_t step,
2454 gmx_domdec_t *dd,
2455 real cutoff,
2456 gmx_ddbox_t *ddbox,
2457 gmx_bool bFatal)
2459 gmx_domdec_comm_t *comm;
2460 int d, dim;
2461 real limit, bfac;
2462 gmx_bool bInvalid;
2464 bInvalid = FALSE;
2466 comm = dd->comm;
2468 for (d = 1; d < dd->ndim; d++)
2470 dim = dd->dim[d];
2471 limit = grid_jump_limit(comm, cutoff, d);
2472 bfac = ddbox->box_size[dim];
2473 if (ddbox->tric_dir[dim])
2475 bfac *= ddbox->skew_fac[dim];
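/* cell_f_max0 and cell_f_min1 hold the extreme boundary fractions of the
 * staggered neighboring cell rows. The checks below flag the decomposition
 * as invalid when one of our boundaries comes closer to them than the
 * jump limit allows, i.e. when cells have shifted by more than intended.
 */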
2477 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2478 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2480 bInvalid = TRUE;
2482 if (bFatal)
2484 char buf[22];
2486 /* This error should never be triggered under normal
2487 * circumstances, but you never know ...
2489 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2490 gmx_step_str(step, buf),
2491 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2496 return bInvalid;
2499 static int dd_load_count(gmx_domdec_comm_t *comm)
2501 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2504 static float dd_force_load(gmx_domdec_comm_t *comm)
2506 float load;
2508 if (comm->eFlop)
2510 load = comm->flop;
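/* With eFlop > 1 the flop count is perturbed by up to roughly +-5%
 * (scaled by eFlop - 1) of random noise; this artificially varies the
 * measured load, presumably to exercise the dynamic load balancing.
 */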
2511 if (comm->eFlop > 1)
2513 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2516 else
2518 load = comm->cycl[ddCyclF];
2519 if (comm->cycl_n[ddCyclF] > 1)
2521 /* Subtract the maximum of the last n cycle counts
2522 * to get rid of possible high counts due to other sources,
2523 * for instance system activity, that would otherwise
2524 * affect the dynamic load balancing.
2526 load -= comm->cycl_max[ddCyclF];
2529 #if GMX_MPI
2530 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2532 float gpu_wait, gpu_wait_sum;
2534 gpu_wait = comm->cycl[ddCyclWaitGPU];
2535 if (comm->cycl_n[ddCyclF] > 1)
2537 /* We should remove the WaitGPU time of the same MD step
2538 * as the one with the maximum F time, since the F time
2539 * and the wait time are not independent.
2540 * Furthermore, the step for the max F time should be chosen
2541 * the same on all ranks that share the same GPU.
2542 * But to keep the code simple, we remove the average instead.
2543 * The main reason for artificially long times at some steps
2544 * is spurious CPU activity or MPI time, so we don't expect
2545 * that changes in the GPU wait time matter a lot here.
2547 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2549 /* Sum the wait times over the ranks that share the same GPU */
2550 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2551 comm->mpi_comm_gpu_shared);
2552 /* Replace the wait time by the average over the ranks */
2553 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2555 #endif
2558 return load;
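/* Fills *dim_f with the cumulative cell boundary fractions along dim for
 * the static decomposition: either from the user-supplied slb_frac
 * fractions or uniform, with (*dim_f)[0] = 0 and (*dim_f)[nc] = 1.
 */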
2561 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2563 gmx_domdec_comm_t *comm;
2564 int i;
2566 comm = dd->comm;
2568 snew(*dim_f, dd->nc[dim]+1);
2569 (*dim_f)[0] = 0;
2570 for (i = 1; i < dd->nc[dim]; i++)
2572 if (comm->slb_frac[dim])
2574 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2576 else
2578 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2581 (*dim_f)[dd->nc[dim]] = 1;
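/* Sets up the PME slab bookkeeping for decomposition index dimind: along
 * which dimension the slabs run, whether that matches the PP decomposition,
 * and for each slab the range (pp_min/pp_max) of PP cell indices along
 * that dimension that map onto the slab.
 */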
2584 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2586 int pmeindex, slab, nso, i;
2587 ivec xyz;
2589 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2591 ddpme->dim = YY;
2593 else
2595 ddpme->dim = dimind;
2597 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2599 ddpme->nslab = (ddpme->dim == 0 ?
2600 dd->comm->npmenodes_x :
2601 dd->comm->npmenodes_y);
2603 if (ddpme->nslab <= 1)
2605 return;
2608 nso = dd->comm->npmenodes/ddpme->nslab;
2609 /* Determine for each PME slab the PP location range for dimension dim */
2610 snew(ddpme->pp_min, ddpme->nslab);
2611 snew(ddpme->pp_max, ddpme->nslab);
2612 for (slab = 0; slab < ddpme->nslab; slab++)
2614 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2615 ddpme->pp_max[slab] = 0;
2617 for (i = 0; i < dd->nnodes; i++)
2619 ddindex2xyz(dd->nc, i, xyz);
2620 /* For y only use our y/z slab.
2621 * This assumes that the PME x grid size matches the DD grid size.
2623 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2625 pmeindex = ddindex2pmeindex(dd, i);
2626 if (dimind == 0)
2628 slab = pmeindex/nso;
2630 else
2632 slab = pmeindex % ddpme->nslab;
2634 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2635 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2639 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2642 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2644 if (dd->comm->ddpme[0].dim == XX)
2646 return dd->comm->ddpme[0].maxshift;
2648 else
2650 return 0;
2654 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2656 if (dd->comm->ddpme[0].dim == YY)
2658 return dd->comm->ddpme[0].maxshift;
2660 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2662 return dd->comm->ddpme[1].maxshift;
2664 else
2666 return 0;
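/* Determines ddpme->maxshift: effectively the maximum number of PME slabs
 * away from its matching slab that atoms of any PP cell may need to be
 * sent, given the current cell boundaries cell_f. This bounds the
 * coordinate redistribution range for PME.
 */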
2670 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2671 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2672 const real *cell_f)
2674 gmx_domdec_comm_t *comm;
2675 int nc, ns, s;
2676 int *xmin, *xmax;
2677 real range, pme_boundary;
2678 int sh;
2680 comm = dd->comm;
2681 nc = dd->nc[ddpme->dim];
2682 ns = ddpme->nslab;
2684 if (!ddpme->dim_match)
2686 /* PP decomposition is not along dim: the worst situation */
2687 sh = ns/2;
2689 else if (ns <= 3 || (bUniform && ns == nc))
2691 /* The optimal situation */
2692 sh = 1;
2694 else
2696 /* For each PME node we need to check which nodes it
2697  * could possibly need to communicate with.
2699 xmin = ddpme->pp_min;
2700 xmax = ddpme->pp_max;
2701 /* Allow for atoms to be maximally 2/3 times the cut-off
2702 * out of their DD cell. This is a reasonable balance between
2703  * performance and support for most charge-group/cut-off
2704 * combinations.
2706 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2707 /* Avoid extra communication when we are exactly at a boundary */
2708 range *= 0.999;
2710 sh = 1;
2711 for (s = 0; s < ns; s++)
2713 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2714 pme_boundary = (real)s/ns;
2715 while (sh+1 < ns &&
2716 ((s-(sh+1) >= 0 &&
2717 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2718 (s-(sh+1) < 0 &&
2719 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2721 sh++;
2723 pme_boundary = (real)(s+1)/ns;
2724 while (sh+1 < ns &&
2725 ((s+(sh+1) < ns &&
2726 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2727 (s+(sh+1) >= ns &&
2728 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2730 sh++;
2735 ddpme->maxshift = sh;
2737 if (debug)
2739 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2740 ddpme->dim, ddpme->maxshift);
2744 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2746 int d, dim;
2748 for (d = 0; d < dd->ndim; d++)
2750 dim = dd->dim[d];
2751 if (dim < ddbox->nboundeddim &&
2752 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2753 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2755 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2756 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2757 dd->nc[dim], dd->comm->cellsize_limit);
2762 enum {
2763 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2766 /* Set the domain boundaries. Use for static (or no) load balancing,
2767 * and also for the starting state for dynamic load balancing.
2768  * setmode determines if and where the boundaries are stored, use the enum above.
2769  * Returns the number of communication pulses in npulse.
2771 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2772 int setmode, ivec npulse)
2774 gmx_domdec_comm_t *comm;
2775 int d, j;
2776 rvec cellsize_min;
2777 real *cell_x, cell_dx, cellsize;
2779 comm = dd->comm;
2781 for (d = 0; d < DIM; d++)
2783 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2784 npulse[d] = 1;
2785 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2787 /* Uniform grid */
2788 cell_dx = ddbox->box_size[d]/dd->nc[d];
2789 switch (setmode)
2791 case setcellsizeslbMASTER:
2792 for (j = 0; j < dd->nc[d]+1; j++)
2794 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2796 break;
2797 case setcellsizeslbLOCAL:
2798 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2799 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2800 break;
2801 default:
2802 break;
2804 cellsize = cell_dx*ddbox->skew_fac[d];
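/* The number of communication pulses is the smallest integer for which
 * npulse*cellsize covers the cut-off, i.e. effectively ceil(cutoff/cellsize).
 */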
2805 while (cellsize*npulse[d] < comm->cutoff)
2807 npulse[d]++;
2809 cellsize_min[d] = cellsize;
2811 else
2813 /* Statically load balanced grid */
2814 /* Even when we are not doing a master distribution, we determine
2815  * all cell borders in a loop to obtain values identical
2816  * to the master distribution case and to determine npulse.
2818 if (setmode == setcellsizeslbMASTER)
2820 cell_x = dd->ma->cell_x[d];
2822 else
2824 snew(cell_x, dd->nc[d]+1);
2826 cell_x[0] = ddbox->box0[d];
2827 for (j = 0; j < dd->nc[d]; j++)
2829 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2830 cell_x[j+1] = cell_x[j] + cell_dx;
2831 cellsize = cell_dx*ddbox->skew_fac[d];
2832 while (cellsize*npulse[d] < comm->cutoff &&
2833 npulse[d] < dd->nc[d]-1)
2835 npulse[d]++;
2837 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2839 if (setmode == setcellsizeslbLOCAL)
2841 comm->cell_x0[d] = cell_x[dd->ci[d]];
2842 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2844 if (setmode != setcellsizeslbMASTER)
2846 sfree(cell_x);
2849 /* The following limitation prevents a cell from receiving
2850  * some of its own home charge groups back over the periodic boundary.
2851  * Duplicate charge groups cause trouble with the global indices.
2853 if (d < ddbox->npbcdim &&
2854 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2856 char error_string[STRLEN];
2858 sprintf(error_string,
2859 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2860 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2861 comm->cutoff,
2862 dd->nc[d], dd->nc[d],
2863 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2865 if (setmode == setcellsizeslbLOCAL)
2867 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2868 error_string);
2870 else
2872 gmx_fatal(FARGS, error_string);
2877 if (!isDlbOn(comm))
2879 copy_rvec(cellsize_min, comm->cellsize_min);
2882 for (d = 0; d < comm->npmedecompdim; d++)
2884 set_pme_maxshift(dd, &comm->ddpme[d],
2885 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2886 comm->ddpme[d].slb_dim_f);
2891 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2892 int d, int dim, domdec_root_t *root,
2893 const gmx_ddbox_t *ddbox,
2894 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2896 gmx_domdec_comm_t *comm;
2897 int ncd, i, j, nmin, nmin_old;
2898 gmx_bool bLimLo, bLimHi;
2899 real *cell_size;
2900 real fac, halfway, cellsize_limit_f_i, region_size;
2901 gmx_bool bPBC, bLastHi = FALSE;
2902 int nrange[] = {range[0], range[1]};
2904 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2906 comm = dd->comm;
2908 ncd = dd->nc[dim];
2910 bPBC = (dim < ddbox->npbcdim);
2912 cell_size = root->buf_ncd;
2914 if (debug)
2916 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2919 /* First we need to check if the scaling does not make cells
2920 * smaller than the smallest allowed size.
2921 * We need to do this iteratively, since if a cell is too small,
2922 * it needs to be enlarged, which makes all the other cells smaller,
2923 * which could in turn make another cell smaller than allowed.
2925 for (i = range[0]; i < range[1]; i++)
2927 root->bCellMin[i] = FALSE;
2929 nmin = 0;
2932 nmin_old = nmin;
2933 /* We need the total for normalization */
2934 fac = 0;
2935 for (i = range[0]; i < range[1]; i++)
2937 if (root->bCellMin[i] == FALSE)
2939 fac += cell_size[i];
2942 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2943 /* Determine the cell boundaries */
2944 for (i = range[0]; i < range[1]; i++)
2946 if (root->bCellMin[i] == FALSE)
2948 cell_size[i] *= fac;
2949 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2951 cellsize_limit_f_i = 0;
2953 else
2955 cellsize_limit_f_i = cellsize_limit_f;
2957 if (cell_size[i] < cellsize_limit_f_i)
2959 root->bCellMin[i] = TRUE;
2960 cell_size[i] = cellsize_limit_f_i;
2961 nmin++;
2964 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2967 while (nmin > nmin_old);
2969 i = range[1]-1;
2970 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2971 /* For this check we should not use DD_CELL_MARGIN,
2972 * but a slightly smaller factor,
2973  * since rounding could get us below the limit.
2975 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2977 char buf[22];
2978 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2979 gmx_step_str(step, buf),
2980 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2981 ncd, comm->cellsize_min[dim]);
2984 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2986 if (!bUniform)
2988 /* Check if the boundary did not displace more than halfway
2989 * each of the cells it bounds, as this could cause problems,
2990 * especially when the differences between cell sizes are large.
2991 * If changes are applied, they will not make cells smaller
2992 * than the cut-off, as we check all the boundaries which
2993 * might be affected by a change and if the old state was ok,
2994 * the cells will at most be shrunk back to their old size.
2996 for (i = range[0]+1; i < range[1]; i++)
2998 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
2999 if (root->cell_f[i] < halfway)
3001 root->cell_f[i] = halfway;
3002 /* Check if the change also causes shifts of the next boundaries */
3003 for (j = i+1; j < range[1]; j++)
3005 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3007 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3011 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3012 if (root->cell_f[i] > halfway)
3014 root->cell_f[i] = halfway;
3015 /* Check if the change also causes shifts of the next boundaries */
3016 for (j = i-1; j >= range[0]+1; j--)
3018 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3020 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3027 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3028 /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3029  * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta).
3030  * oldb and nexta can be the range boundaries; nrange is used for a and b. */
3031 if (d > 0)
3033 /* Take care of the staggering of the cell boundaries */
3034 if (bUniform)
3036 for (i = range[0]; i < range[1]; i++)
3038 root->cell_f_max0[i] = root->cell_f[i];
3039 root->cell_f_min1[i] = root->cell_f[i+1];
3042 else
3044 for (i = range[0]+1; i < range[1]; i++)
3046 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3047 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3048 if (bLimLo && bLimHi)
3050 /* Both limits violated, try the best we can */
3051 /* In this case we split the original range (range) into two parts and handle the other limitations in the next iteration. */
3052 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3053 nrange[0] = range[0];
3054 nrange[1] = i;
3055 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3057 nrange[0] = i;
3058 nrange[1] = range[1];
3059 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3061 return;
3063 else if (bLimLo)
3065 /* root->cell_f[i] = root->bound_min[i]; */
3066 nrange[1] = i; /* Only store the violation location. There could be another LimLo violation following at a higher index */
3067 bLastHi = FALSE;
3069 else if (bLimHi && !bLastHi)
3071 bLastHi = TRUE;
3072 if (nrange[1] < range[1]) /* found a LimLo before */
3074 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3075 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3076 nrange[0] = nrange[1];
3078 root->cell_f[i] = root->bound_max[i];
3079 nrange[1] = i;
3080 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3081 nrange[0] = i;
3082 nrange[1] = range[1];
3085 if (nrange[1] < range[1]) /* a LimLo was found last */
3087 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3088 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3089 nrange[0] = nrange[1];
3090 nrange[1] = range[1];
3091 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3093 else if (nrange[0] > range[0]) /* found at least one LimHi */
3095 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3102 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3103 int d, int dim, domdec_root_t *root,
3104 const gmx_ddbox_t *ddbox,
3105 gmx_bool bDynamicBox,
3106 gmx_bool bUniform, gmx_int64_t step)
3108 gmx_domdec_comm_t *comm;
3109 int ncd, d1, i, pos;
3110 real *cell_size;
3111 real load_aver, load_i, imbalance, change, change_max, sc;
3112 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3113 real change_limit;
3114 real relax = 0.5;
3115 gmx_bool bPBC;
3116 int range[] = { 0, 0 };
3118 comm = dd->comm;
3120 /* Convert the maximum change from the input percentage to a fraction */
3121 change_limit = comm->dlb_scale_lim*0.01;
3123 ncd = dd->nc[dim];
3125 bPBC = (dim < ddbox->npbcdim);
3127 cell_size = root->buf_ncd;
3129 /* Store the original boundaries */
3130 for (i = 0; i < ncd+1; i++)
3132 root->old_cell_f[i] = root->cell_f[i];
3134 if (bUniform)
3136 for (i = 0; i < ncd; i++)
3138 cell_size[i] = 1.0/ncd;
3141 else if (dd_load_count(comm) > 0)
3143 load_aver = comm->load[d].sum_m/ncd;
3144 change_max = 0;
3145 for (i = 0; i < ncd; i++)
3147 /* Determine the relative imbalance of cell i */
3148 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3149 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3150 /* Determine the change of the cell size using underrelaxation */
3151 change = -relax*imbalance;
3152 change_max = std::max(change_max, std::max(change, -change));
3154 /* Limit the amount of scaling.
3155 * We need to use the same rescaling for all cells in one row,
3156 * otherwise the load balancing might not converge.
3158 sc = relax;
3159 if (change_max > change_limit)
3161 sc *= change_limit/change_max;
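/* Apply the (possibly clamped) relaxation: each cell's new relative size
 * becomes its old size times (1 - sc*imbalance), so overloaded cells
 * shrink and underloaded cells grow.
 */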
3163 for (i = 0; i < ncd; i++)
3165 /* Determine the relative imbalance of cell i */
3166 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3167 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3168 /* Determine the change of the cell size using underrelaxation */
3169 change = -sc*imbalance;
3170 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3174 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3175 cellsize_limit_f *= DD_CELL_MARGIN;
3176 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3177 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3178 if (ddbox->tric_dir[dim])
3180 cellsize_limit_f /= ddbox->skew_fac[dim];
3181 dist_min_f /= ddbox->skew_fac[dim];
3183 if (bDynamicBox && d > 0)
3185 dist_min_f *= DD_PRES_SCALE_MARGIN;
3187 if (d > 0 && !bUniform)
3189 /* Make sure that the grid is not shifted too much */
3190 for (i = 1; i < ncd; i++)
3192 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3194 gmx_incons("Inconsistent DD boundary staggering limits!");
3196 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3197 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3198 if (space > 0)
3200 root->bound_min[i] += 0.5*space;
3202 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3203 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3204 if (space < 0)
3206 root->bound_max[i] += 0.5*space;
3208 if (debug)
3210 fprintf(debug,
3211 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3212 d, i,
3213 root->cell_f_max0[i-1] + dist_min_f,
3214 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3215 root->cell_f_min1[i] - dist_min_f);
3219 range[1] = ncd;
3220 root->cell_f[0] = 0;
3221 root->cell_f[ncd] = 1;
3222 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3225 /* After the checks above, the cells should obey the cut-off
3226 * restrictions, but it does not hurt to check.
3228 for (i = 0; i < ncd; i++)
3230 if (debug)
3232 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3233 dim, i, root->cell_f[i], root->cell_f[i+1]);
3236 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3237 root->cell_f[i+1] - root->cell_f[i] <
3238 cellsize_limit_f/DD_CELL_MARGIN)
3240 char buf[22];
3241 fprintf(stderr,
3242 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3243 gmx_step_str(step, buf), dim2char(dim), i,
3244 (root->cell_f[i+1] - root->cell_f[i])
3245 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3249 pos = ncd + 1;
3250 /* Store the cell boundaries of the lower dimensions at the end */
3251 for (d1 = 0; d1 < d; d1++)
3253 root->cell_f[pos++] = comm->cell_f0[d1];
3254 root->cell_f[pos++] = comm->cell_f1[d1];
3257 if (d < comm->npmedecompdim)
3259 /* The master determines the maximum shift for
3260 * the coordinate communication between separate PME nodes.
3262 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3264 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3265 if (d >= 1)
3267 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3271 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3272 const gmx_ddbox_t *ddbox,
3273 int dimind)
3275 gmx_domdec_comm_t *comm;
3276 int dim;
3278 comm = dd->comm;
3280 /* Set the cell dimensions */
3281 dim = dd->dim[dimind];
3282 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3283 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3284 if (dim >= ddbox->nboundeddim)
3286 comm->cell_x0[dim] += ddbox->box0[dim];
3287 comm->cell_x1[dim] += ddbox->box0[dim];
3291 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3292 int d, int dim, real *cell_f_row,
3293 const gmx_ddbox_t *ddbox)
3295 gmx_domdec_comm_t *comm;
3296 int d1, pos;
3298 comm = dd->comm;
3300 #if GMX_MPI
3301 /* Each node would only need to know two fractions,
3302 * but it is probably cheaper to broadcast the whole array.
3304 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3305 0, comm->mpi_comm_load[d]);
3306 #endif
3307 /* Copy the fractions for this dimension from the buffer */
3308 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3309 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3310 /* The whole array was communicated, so set the buffer position */
3311 pos = dd->nc[dim] + 1;
3312 for (d1 = 0; d1 <= d; d1++)
3314 if (d1 < d)
3316 /* Copy the cell fractions of the lower dimensions */
3317 comm->cell_f0[d1] = cell_f_row[pos++];
3318 comm->cell_f1[d1] = cell_f_row[pos++];
3320 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3322 /* Convert the communicated shift from float to int */
3323 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3324 if (d >= 1)
3326 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3330 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3331 const gmx_ddbox_t *ddbox,
3332 gmx_bool bDynamicBox,
3333 gmx_bool bUniform, gmx_int64_t step)
3335 gmx_domdec_comm_t *comm;
3336 int d, dim, d1;
3337 gmx_bool bRowMember, bRowRoot;
3338 real *cell_f_row;
3340 comm = dd->comm;
3342 for (d = 0; d < dd->ndim; d++)
3344 dim = dd->dim[d];
3345 bRowMember = TRUE;
3346 bRowRoot = TRUE;
3347 for (d1 = d; d1 < dd->ndim; d1++)
3349 if (dd->ci[dd->dim[d1]] > 0)
3351 if (d1 != d)
3353 bRowMember = FALSE;
3355 bRowRoot = FALSE;
3358 if (bRowMember)
3360 if (bRowRoot)
3362 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3363 ddbox, bDynamicBox, bUniform, step);
3364 cell_f_row = comm->root[d]->cell_f;
3366 else
3368 cell_f_row = comm->cell_f_row;
3370 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3375 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3376 const gmx_ddbox_t *ddbox)
3378 int d;
3380 /* This function assumes the box is static and should therefore
3381 * not be called when the box has changed since the last
3382 * call to dd_partition_system.
3384 for (d = 0; d < dd->ndim; d++)
3386 relative_to_absolute_cell_bounds(dd, ddbox, d);
3392 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3393 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3394 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3395 gmx_wallcycle_t wcycle)
3397 gmx_domdec_comm_t *comm;
3398 int dim;
3400 comm = dd->comm;
3402 if (bDoDLB)
3404 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3405 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3406 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3408 else if (bDynamicBox)
3410 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3413 /* Set the dimensions for which no DD is used */
3414 for (dim = 0; dim < DIM; dim++)
3416 if (dd->nc[dim] == 1)
3418 comm->cell_x0[dim] = 0;
3419 comm->cell_x1[dim] = ddbox->box_size[dim];
3420 if (dim >= ddbox->nboundeddim)
3422 comm->cell_x0[dim] += ddbox->box0[dim];
3423 comm->cell_x1[dim] += ddbox->box0[dim];
3429 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3431 int d, np, i;
3432 gmx_domdec_comm_dim_t *cd;
3434 for (d = 0; d < dd->ndim; d++)
3436 cd = &dd->comm->cd[d];
3437 np = npulse[dd->dim[d]];
3438 if (np > cd->np_nalloc)
3440 if (debug)
3442 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3443 dim2char(dd->dim[d]), np);
3445 if (DDMASTER(dd) && cd->np_nalloc > 0)
3447 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3449 srenew(cd->ind, np);
3450 for (i = cd->np_nalloc; i < np; i++)
3452 cd->ind[i].index = nullptr;
3453 cd->ind[i].nalloc = 0;
3455 cd->np_nalloc = np;
3457 cd->np = np;
3462 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3463 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3464 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3465 gmx_wallcycle_t wcycle)
3467 gmx_domdec_comm_t *comm;
3468 int d;
3469 ivec npulse;
3471 comm = dd->comm;
3473 /* Copy the old cell boundaries for the cg displacement check */
3474 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3475 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3477 if (isDlbOn(comm))
3479 if (DDMASTER(dd))
3481 check_box_size(dd, ddbox);
3483 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3485 else
3487 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3488 realloc_comm_ind(dd, npulse);
3491 if (debug)
3493 for (d = 0; d < DIM; d++)
3495 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3496 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3501 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3502 gmx_ddbox_t *ddbox,
3503 rvec cell_ns_x0, rvec cell_ns_x1,
3504 gmx_int64_t step)
3506 gmx_domdec_comm_t *comm;
3507 int dim_ind, dim;
3509 comm = dd->comm;
3511 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3513 dim = dd->dim[dim_ind];
3515 /* Without PBC we don't have restrictions on the outer cells */
3516 if (!(dim >= ddbox->npbcdim &&
3517 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3518 isDlbOn(comm) &&
3519 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3520 comm->cellsize_min[dim])
3522 char buf[22];
3523 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3524 gmx_step_str(step, buf), dim2char(dim),
3525 comm->cell_x1[dim] - comm->cell_x0[dim],
3526 ddbox->skew_fac[dim],
3527 dd->comm->cellsize_min[dim],
3528 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3532 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3534 /* Communicate the boundaries and update cell_ns_x0/1 */
3535 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3536 if (isDlbOn(dd->comm) && dd->ndim > 1)
3538 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
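/* Builds the lower-triangular triclinic correction matrix tcm.
 * Adding cm[j]*tcm[j][d] for j > d to a Cartesian coordinate gives the
 * position along dimension d relative to the skewed box axes, which can
 * then be compared directly with the cell boundaries along d.
 */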
3543 static void make_tric_corr_matrix(int npbcdim, const matrix box, matrix tcm)
3545 if (YY < npbcdim)
3547 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3549 else
3551 tcm[YY][XX] = 0;
3553 if (ZZ < npbcdim)
3555 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3556 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3558 else
3560 tcm[ZZ][XX] = 0;
3561 tcm[ZZ][YY] = 0;
3565 static void check_screw_box(const matrix box)
3567 /* Mathematical limitation */
3568 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3570 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3573 /* Limitation due to the asymmetry of the eighth shell method */
3574 if (box[ZZ][YY] != 0)
3576 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3580 static void distribute_cg(FILE *fplog,
3581 const matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3582 gmx_domdec_t *dd)
3584 gmx_domdec_master_t *ma;
3585 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3586 int i, icg, j, k, k0, k1, d;
3587 matrix tcm;
3588 rvec cg_cm;
3589 ivec ind;
3590 real nrcg, inv_ncg, pos_d;
3591 int *cgindex;
3592 gmx_bool bScrew;
3594 ma = dd->ma;
3596 snew(tmp_nalloc, dd->nnodes);
3597 snew(tmp_ind, dd->nnodes);
3598 for (i = 0; i < dd->nnodes; i++)
3600 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3601 snew(tmp_ind[i], tmp_nalloc[i]);
3604 /* Clear the count */
3605 for (i = 0; i < dd->nnodes; i++)
3607 ma->ncg[i] = 0;
3608 ma->nat[i] = 0;
3611 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3613 cgindex = cgs->index;
3615 /* Compute the center of geometry for all charge groups */
3616 for (icg = 0; icg < cgs->nr; icg++)
3618 k0 = cgindex[icg];
3619 k1 = cgindex[icg+1];
3620 nrcg = k1 - k0;
3621 if (nrcg == 1)
3623 copy_rvec(pos[k0], cg_cm);
3625 else
3627 inv_ncg = 1.0/nrcg;
3629 clear_rvec(cg_cm);
3630 for (k = k0; (k < k1); k++)
3632 rvec_inc(cg_cm, pos[k]);
3634 for (d = 0; (d < DIM); d++)
3636 cg_cm[d] *= inv_ncg;
3639 /* Put the charge group in the box and determine the cell index */
3640 for (d = DIM-1; d >= 0; d--)
3642 pos_d = cg_cm[d];
3643 if (d < dd->npbcdim)
3645 bScrew = (dd->bScrewPBC && d == XX);
3646 if (tric_dir[d] && dd->nc[d] > 1)
3648 /* Use triclinic coordinates for this dimension */
3649 for (j = d+1; j < DIM; j++)
3651 pos_d += cg_cm[j]*tcm[j][d];
3654 while (pos_d >= box[d][d])
3656 pos_d -= box[d][d];
3657 rvec_dec(cg_cm, box[d]);
3658 if (bScrew)
3660 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3661 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3663 for (k = k0; (k < k1); k++)
3665 rvec_dec(pos[k], box[d]);
3666 if (bScrew)
3668 pos[k][YY] = box[YY][YY] - pos[k][YY];
3669 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3673 while (pos_d < 0)
3675 pos_d += box[d][d];
3676 rvec_inc(cg_cm, box[d]);
3677 if (bScrew)
3679 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3680 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3682 for (k = k0; (k < k1); k++)
3684 rvec_inc(pos[k], box[d]);
3685 if (bScrew)
3687 pos[k][YY] = box[YY][YY] - pos[k][YY];
3688 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3693 /* This could be done more efficiently */
3694 ind[d] = 0;
3695 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3697 ind[d]++;
3700 i = dd_index(dd->nc, ind);
3701 if (ma->ncg[i] == tmp_nalloc[i])
3703 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3704 srenew(tmp_ind[i], tmp_nalloc[i]);
3706 tmp_ind[i][ma->ncg[i]] = icg;
3707 ma->ncg[i]++;
3708 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3711 k1 = 0;
3712 for (i = 0; i < dd->nnodes; i++)
3714 ma->index[i] = k1;
3715 for (k = 0; k < ma->ncg[i]; k++)
3717 ma->cg[k1++] = tmp_ind[i][k];
3720 ma->index[dd->nnodes] = k1;
3722 for (i = 0; i < dd->nnodes; i++)
3724 sfree(tmp_ind[i]);
3726 sfree(tmp_ind);
3727 sfree(tmp_nalloc);
3729 if (fplog)
3731 // Use double for the sums to avoid natoms^2 overflowing
3732 // (65537^2 > 2^32)
3733 int nat_sum, nat_min, nat_max;
3734 double nat2_sum;
3736 nat_sum = 0;
3737 nat2_sum = 0;
3738 nat_min = ma->nat[0];
3739 nat_max = ma->nat[0];
3740 for (i = 0; i < dd->nnodes; i++)
3742 nat_sum += ma->nat[i];
3743 // cast to double to avoid integer overflows when squaring
3744 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3745 nat_min = std::min(nat_min, ma->nat[i]);
3746 nat_max = std::max(nat_max, ma->nat[i]);
3748 nat_sum /= dd->nnodes;
3749 nat2_sum /= dd->nnodes;
3751 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3752 dd->nnodes,
3753 nat_sum,
3754 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3755 nat_min, nat_max);
3759 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3760 t_block *cgs, const matrix box, gmx_ddbox_t *ddbox,
3761 rvec pos[])
3763 gmx_domdec_master_t *ma = nullptr;
3764 ivec npulse;
3765 int i, cg_gl;
3766 int *ibuf, buf2[2] = { 0, 0 };
3767 gmx_bool bMaster = DDMASTER(dd);
3769 if (bMaster)
3771 ma = dd->ma;
3773 if (dd->bScrewPBC)
3775 check_screw_box(box);
3778 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3780 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3781 for (i = 0; i < dd->nnodes; i++)
3783 ma->ibuf[2*i] = ma->ncg[i];
3784 ma->ibuf[2*i+1] = ma->nat[i];
3786 ibuf = ma->ibuf;
3788 else
3790 ibuf = nullptr;
3792 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3794 dd->ncg_home = buf2[0];
3795 dd->nat_home = buf2[1];
3796 dd->ncg_tot = dd->ncg_home;
3797 dd->nat_tot = dd->nat_home;
3798 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3800 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3801 srenew(dd->index_gl, dd->cg_nalloc);
3802 srenew(dd->cgindex, dd->cg_nalloc+1);
3804 if (bMaster)
3806 for (i = 0; i < dd->nnodes; i++)
3808 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3809 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3813 dd_scatterv(dd,
3814 bMaster ? ma->ibuf : nullptr,
3815 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3816 bMaster ? ma->cg : nullptr,
3817 dd->ncg_home*sizeof(int), dd->index_gl);
3819 /* Determine the home charge group sizes */
3820 dd->cgindex[0] = 0;
3821 for (i = 0; i < dd->ncg_home; i++)
3823 cg_gl = dd->index_gl[i];
3824 dd->cgindex[i+1] =
3825 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3828 if (debug)
3830 fprintf(debug, "Home charge groups:\n");
3831 for (i = 0; i < dd->ncg_home; i++)
3833 fprintf(debug, " %d", dd->index_gl[i]);
3834 if (i % 10 == 9)
3836 fprintf(debug, "\n");
3839 fprintf(debug, "\n");
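/* Handles one state vector (index vec out of nvec) for each home charge
 * group: groups that stay (move[icg] == -1) are optionally compacted in
 * place, while groups that move have their rvecs copied into the
 * communication buffer of the target direction, interleaved with the
 * other state vectors. Returns the new number of home atoms.
 */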
3843 static int compact_and_copy_vec_at(int ncg, int *move,
3844 int *cgindex,
3845 int nvec, int vec,
3846 rvec *src, gmx_domdec_comm_t *comm,
3847 gmx_bool bCompact)
3849 int m, icg, i, i0, i1, nrcg;
3850 int home_pos;
3851 int pos_vec[DIM*2];
3853 home_pos = 0;
3855 for (m = 0; m < DIM*2; m++)
3857 pos_vec[m] = 0;
3860 i0 = 0;
3861 for (icg = 0; icg < ncg; icg++)
3863 i1 = cgindex[icg+1];
3864 m = move[icg];
3865 if (m == -1)
3867 if (bCompact)
3869 /* Compact the home array in place */
3870 for (i = i0; i < i1; i++)
3872 copy_rvec(src[i], src[home_pos++]);
3876 else
3878 /* Copy to the communication buffer */
3879 nrcg = i1 - i0;
3880 pos_vec[m] += 1 + vec*nrcg;
3881 for (i = i0; i < i1; i++)
3883 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3885 pos_vec[m] += (nvec - vec - 1)*nrcg;
3887 if (!bCompact)
3889 home_pos += i1 - i0;
3891 i0 = i1;
3894 return home_pos;
3897 static int compact_and_copy_vec_cg(int ncg, int *move,
3898 int *cgindex,
3899 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3900 gmx_bool bCompact)
3902 int m, icg, i0, i1, nrcg;
3903 int home_pos;
3904 int pos_vec[DIM*2];
3906 home_pos = 0;
3908 for (m = 0; m < DIM*2; m++)
3910 pos_vec[m] = 0;
3913 i0 = 0;
3914 for (icg = 0; icg < ncg; icg++)
3916 i1 = cgindex[icg+1];
3917 m = move[icg];
3918 if (m == -1)
3920 if (bCompact)
3922 /* Compact the home array in place */
3923 copy_rvec(src[icg], src[home_pos++]);
3926 else
3928 nrcg = i1 - i0;
3929 /* Copy to the communication buffer */
3930 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3931 pos_vec[m] += 1 + nrcg*nvec;
3933 i0 = i1;
3935 if (!bCompact)
3937 home_pos = ncg;
3940 return home_pos;
3943 static int compact_ind(int ncg, int *move,
3944 int *index_gl, int *cgindex,
3945 int *gatindex,
3946 gmx_ga2la_t *ga2la, char *bLocalCG,
3947 int *cginfo)
3949 int cg, nat, a0, a1, a, a_gl;
3950 int home_pos;
3952 home_pos = 0;
3953 nat = 0;
3954 for (cg = 0; cg < ncg; cg++)
3956 a0 = cgindex[cg];
3957 a1 = cgindex[cg+1];
3958 if (move[cg] == -1)
3960 /* Compact the home arrays in place.
3961 * Anything that can be done here avoids access to global arrays.
3963 cgindex[home_pos] = nat;
3964 for (a = a0; a < a1; a++)
3966 a_gl = gatindex[a];
3967 gatindex[nat] = a_gl;
3968 /* The cell number stays 0, so we don't need to set it */
3969 ga2la_change_la(ga2la, a_gl, nat);
3970 nat++;
3972 index_gl[home_pos] = index_gl[cg];
3973 cginfo[home_pos] = cginfo[cg];
3974 /* The charge group remains local, so bLocalCG does not change */
3975 home_pos++;
3977 else
3979 /* Clear the global indices */
3980 for (a = a0; a < a1; a++)
3982 ga2la_del(ga2la, gatindex[a]);
3984 if (bLocalCG)
3986 bLocalCG[index_gl[cg]] = FALSE;
3990 cgindex[home_pos] = nat;
3992 return home_pos;
3995 static void clear_and_mark_ind(int ncg, int *move,
3996 int *index_gl, int *cgindex, int *gatindex,
3997 gmx_ga2la_t *ga2la, char *bLocalCG,
3998 int *cell_index)
4000 int cg, a0, a1, a;
4002 for (cg = 0; cg < ncg; cg++)
4004 if (move[cg] >= 0)
4006 a0 = cgindex[cg];
4007 a1 = cgindex[cg+1];
4008 /* Clear the global indices */
4009 for (a = a0; a < a1; a++)
4011 ga2la_del(ga2la, gatindex[a]);
4013 if (bLocalCG)
4015 bLocalCG[index_gl[cg]] = FALSE;
4017 /* Signal that this cg has moved using the ns cell index.
4018 * Here we set it to -1. fill_grid will change it
4019 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4021 cell_index[cg] = -1;
4026 static void print_cg_move(FILE *fplog,
4027 gmx_domdec_t *dd,
4028 gmx_int64_t step, int cg, int dim, int dir,
4029 gmx_bool bHaveCgcmOld, real limitd,
4030 rvec cm_old, rvec cm_new, real pos_d)
4032 gmx_domdec_comm_t *comm;
4033 char buf[22];
4035 comm = dd->comm;
4037 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4038 if (limitd > 0)
4040 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4041 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4042 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4044 else
4046 /* We don't have a limiting distance available: don't print it */
4047 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4048 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4049 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4051 fprintf(fplog, "distance out of cell %f\n",
4052 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4053 if (bHaveCgcmOld)
4055 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4056 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4058 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4059 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4060 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4061 dim2char(dim),
4062 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4063 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4064 dim2char(dim),
4065 comm->cell_x0[dim], comm->cell_x1[dim]);
4068 static void cg_move_error(FILE *fplog,
4069 gmx_domdec_t *dd,
4070 gmx_int64_t step, int cg, int dim, int dir,
4071 gmx_bool bHaveCgcmOld, real limitd,
4072 rvec cm_old, rvec cm_new, real pos_d)
4074 if (fplog)
4076 print_cg_move(fplog, dd, step, cg, dim, dir,
4077 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4079 print_cg_move(stderr, dd, step, cg, dim, dir,
4080 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4081 gmx_fatal(FARGS,
4082 "%s moved too far between two domain decomposition steps\n"
4083 "This usually means that your system is not well equilibrated",
4084 dd->comm->bCGs ? "A charge group" : "An atom");
4087 static void rotate_state_atom(t_state *state, int a)
4089 if (state->flags & (1 << estX))
4091 /* Rotate the complete state; for a rectangular box only */
4092 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4093 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4095 if (state->flags & (1 << estV))
4097 state->v[a][YY] = -state->v[a][YY];
4098 state->v[a][ZZ] = -state->v[a][ZZ];
4100 if (state->flags & (1 << estCGP))
4102 state->cg_p[a][YY] = -state->cg_p[a][YY];
4103 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4107 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4109 if (natoms > comm->moved_nalloc)
4111 /* Contents should be preserved here */
4112 comm->moved_nalloc = over_alloc_dd(natoms);
4113 srenew(comm->moved, comm->moved_nalloc);
4116 return comm->moved;
4119 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4120 gmx_domdec_t *dd,
4121 t_state *state,
4122 ivec tric_dir, matrix tcm,
4123 rvec cell_x0, rvec cell_x1,
4124 rvec limitd, rvec limit0, rvec limit1,
4125 const int *cgindex,
4126 int cg_start, int cg_end,
4127 rvec *cg_cm,
4128 int *move)
4130 int npbcdim;
4131 int cg, k, k0, k1, d, dim, d2;
4132 int mc, nrcg;
4133 int flag;
4134 gmx_bool bScrew;
4135 ivec dev;
4136 real inv_ncg, pos_d;
4137 rvec cm_new;
4139 npbcdim = dd->npbcdim;
4141 for (cg = cg_start; cg < cg_end; cg++)
4143 k0 = cgindex[cg];
4144 k1 = cgindex[cg+1];
4145 nrcg = k1 - k0;
4146 if (nrcg == 1)
4148 copy_rvec(state->x[k0], cm_new);
4150 else
4152 inv_ncg = 1.0/nrcg;
4154 clear_rvec(cm_new);
4155 for (k = k0; (k < k1); k++)
4157 rvec_inc(cm_new, state->x[k]);
4159 for (d = 0; (d < DIM); d++)
4161 cm_new[d] = inv_ncg*cm_new[d];
4165 clear_ivec(dev);
4166 /* Do pbc and check DD cell boundary crossings */
4167 for (d = DIM-1; d >= 0; d--)
4169 if (dd->nc[d] > 1)
4171 bScrew = (dd->bScrewPBC && d == XX);
4172 /* Determine the location of this cg in lattice coordinates */
4173 pos_d = cm_new[d];
4174 if (tric_dir[d])
4176 for (d2 = d+1; d2 < DIM; d2++)
4178 pos_d += cm_new[d2]*tcm[d2][d];
4181 /* Put the charge group in the triclinic unit-cell */
4182 if (pos_d >= cell_x1[d])
4184 if (pos_d >= limit1[d])
4186 cg_move_error(fplog, dd, step, cg, d, 1,
4187 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4188 cg_cm[cg], cm_new, pos_d);
4190 dev[d] = 1;
4191 if (dd->ci[d] == dd->nc[d] - 1)
4193 rvec_dec(cm_new, state->box[d]);
4194 if (bScrew)
4196 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4197 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4199 for (k = k0; (k < k1); k++)
4201 rvec_dec(state->x[k], state->box[d]);
4202 if (bScrew)
4204 rotate_state_atom(state, k);
4209 else if (pos_d < cell_x0[d])
4211 if (pos_d < limit0[d])
4213 cg_move_error(fplog, dd, step, cg, d, -1,
4214 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4215 cg_cm[cg], cm_new, pos_d);
4217 dev[d] = -1;
4218 if (dd->ci[d] == 0)
4220 rvec_inc(cm_new, state->box[d]);
4221 if (bScrew)
4223 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4224 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4226 for (k = k0; (k < k1); k++)
4228 rvec_inc(state->x[k], state->box[d]);
4229 if (bScrew)
4231 rotate_state_atom(state, k);
4237 else if (d < npbcdim)
4239 /* Put the charge group in the rectangular unit-cell */
4240 while (cm_new[d] >= state->box[d][d])
4242 rvec_dec(cm_new, state->box[d]);
4243 for (k = k0; (k < k1); k++)
4245 rvec_dec(state->x[k], state->box[d]);
4248 while (cm_new[d] < 0)
4250 rvec_inc(cm_new, state->box[d]);
4251 for (k = k0; (k < k1); k++)
4253 rvec_inc(state->x[k], state->box[d]);
4259 copy_rvec(cm_new, cg_cm[cg]);
4261 /* Determine where this cg should go */
4262 flag = 0;
4263 mc = -1;
4264 for (d = 0; d < dd->ndim; d++)
4266 dim = dd->dim[d];
4267 if (dev[dim] == 1)
4269 flag |= DD_FLAG_FW(d);
4270 if (mc == -1)
4272 mc = d*2;
4275 else if (dev[dim] == -1)
4277 flag |= DD_FLAG_BW(d);
4278 if (mc == -1)
4280 if (dd->nc[dim] > 2)
4282 mc = d*2 + 1;
4284 else
4286 mc = d*2;
4291 /* Temporarily store the flag in move */
4292 move[cg] = mc + flag;
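/* Moves the charge groups that have left the local cell to the neighboring
 * domains. This is done one decomposition dimension at a time: the cg
 * indices, sizes and flags are exchanged first, then the cg positions and
 * state vectors. Groups that still need to travel along a later dimension
 * are forwarded again in the next round.
 */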
4296 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4297 gmx_domdec_t *dd, ivec tric_dir,
4298 t_state *state, PaddedRVecVector *f,
4299 t_forcerec *fr,
4300 gmx_bool bCompact,
4301 t_nrnb *nrnb,
4302 int *ncg_stay_home,
4303 int *ncg_moved)
4305 int *move;
4306 int npbcdim;
4307 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4308 int i, cg, k, d, dim, dim2, dir, d2, d3;
4309 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4310 int sbuf[2], rbuf[2];
4311 int home_pos_cg, home_pos_at, buf_pos;
4312 int flag;
4313 real pos_d;
4314 matrix tcm;
4315 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4316 int *cgindex;
4317 cginfo_mb_t *cginfo_mb;
4318 gmx_domdec_comm_t *comm;
4319 int *moved;
4320 int nthread, thread;
4322 if (dd->bScrewPBC)
4324 check_screw_box(state->box);
4327 comm = dd->comm;
4328 if (fr->cutoff_scheme == ecutsGROUP)
4330 cg_cm = fr->cg_cm;
4333 // Positions are always present, so there's nothing to flag
4334 bool bV = state->flags & (1<<estV);
4335 bool bCGP = state->flags & (1<<estCGP);
4337 if (dd->ncg_tot > comm->nalloc_int)
4339 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4340 srenew(comm->buf_int, comm->nalloc_int);
4342 move = comm->buf_int;
4344 npbcdim = dd->npbcdim;
4346 for (d = 0; (d < DIM); d++)
4348 limitd[d] = dd->comm->cellsize_min[d];
4349 if (d >= npbcdim && dd->ci[d] == 0)
4351 cell_x0[d] = -GMX_FLOAT_MAX;
4353 else
4355 cell_x0[d] = comm->cell_x0[d];
4357 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4359 cell_x1[d] = GMX_FLOAT_MAX;
4361 else
4363 cell_x1[d] = comm->cell_x1[d];
4365 if (d < npbcdim)
4367 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4368 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4370 else
4372 /* We check after communication if a charge group moved
4373 * more than one cell. Set the pre-comm check limit to float_max.
4375 limit0[d] = -GMX_FLOAT_MAX;
4376 limit1[d] = GMX_FLOAT_MAX;
4380 make_tric_corr_matrix(npbcdim, state->box, tcm);
4382 cgindex = dd->cgindex;
4384 nthread = gmx_omp_nthreads_get(emntDomdec);
4386 /* Compute the center of geometry for all home charge groups
4387 * and put them in the box and determine where they should go.
4389 #pragma omp parallel for num_threads(nthread) schedule(static)
4390 for (thread = 0; thread < nthread; thread++)
4394 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4395 cell_x0, cell_x1, limitd, limit0, limit1,
4396 cgindex,
4397 ( thread *dd->ncg_home)/nthread,
4398 ((thread+1)*dd->ncg_home)/nthread,
4399 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4400 move);
4402 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4405 for (cg = 0; cg < dd->ncg_home; cg++)
4407 if (move[cg] >= 0)
4409 mc = move[cg];
4410 flag = mc & ~DD_FLAG_NRCG;
4411 mc = mc & DD_FLAG_NRCG;
4412 move[cg] = mc;
4414 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4416 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4417 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4419 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4420 /* We store the cg size in the lower 16 bits
4421 * and the place where the charge group should go
4422 * in the next 6 bits. This saves some communication volume.
4424 nrcg = cgindex[cg+1] - cgindex[cg];
4425 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4426 ncg[mc] += 1;
4427 nat[mc] += nrcg;
4431 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4432 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4434 *ncg_moved = 0;
4435 for (i = 0; i < dd->ndim*2; i++)
4437 *ncg_moved += ncg[i];
4440 nvec = 1;
4441 if (bV)
4443 nvec++;
4445 if (bCGP)
4447 nvec++;
4450 /* Make sure the communication buffers are large enough */
4451 for (mc = 0; mc < dd->ndim*2; mc++)
4453 nvr = ncg[mc] + nat[mc]*nvec;
4454 if (nvr > comm->cgcm_state_nalloc[mc])
4456 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4457 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4461 switch (fr->cutoff_scheme)
4463 case ecutsGROUP:
4464 /* Recalculating cg_cm might be cheaper than communicating,
4465 * but that could give rise to rounding issues.
4467 home_pos_cg =
4468 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4469 nvec, cg_cm, comm, bCompact);
4470 break;
4471 case ecutsVERLET:
4472 /* Without charge groups we send the moved atom coordinates
4473 * over twice. This is so the code below can be used without
4474  * many conditionals both with and without charge groups.
4476 home_pos_cg =
4477 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4478 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4479 if (bCompact)
4481 home_pos_cg -= *ncg_moved;
4483 break;
4484 default:
4485 gmx_incons("unimplemented");
4486 home_pos_cg = 0;
4489 vec = 0;
4490 home_pos_at =
4491 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4492 nvec, vec++, as_rvec_array(state->x.data()),
4493 comm, bCompact);
4494 if (bV)
4496 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4497 nvec, vec++, as_rvec_array(state->v.data()),
4498 comm, bCompact);
4500 if (bCGP)
4502 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4503 nvec, vec++, as_rvec_array(state->cg_p.data()),
4504 comm, bCompact);
4507 if (bCompact)
4509 compact_ind(dd->ncg_home, move,
4510 dd->index_gl, dd->cgindex, dd->gatindex,
4511 dd->ga2la, comm->bLocalCG,
4512 fr->cginfo);
4514 else
4516 if (fr->cutoff_scheme == ecutsVERLET)
4518 moved = get_moved(comm, dd->ncg_home);
4520 for (k = 0; k < dd->ncg_home; k++)
4522 moved[k] = 0;
4525 else
4527 moved = fr->ns->grid->cell_index;
4530 clear_and_mark_ind(dd->ncg_home, move,
4531 dd->index_gl, dd->cgindex, dd->gatindex,
4532 dd->ga2la, comm->bLocalCG,
4533 moved);
4536 cginfo_mb = fr->cginfo_mb;
4538 *ncg_stay_home = home_pos_cg;
4539 for (d = 0; d < dd->ndim; d++)
4541 dim = dd->dim[d];
4542 ncg_recv = 0;
4543 nvr = 0;
4544 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4546 cdd = d*2 + dir;
4547 /* Communicate the cg and atom counts */
4548 sbuf[0] = ncg[cdd];
4549 sbuf[1] = nat[cdd];
4550 if (debug)
4552 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4553 d, dir, sbuf[0], sbuf[1]);
4555 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4557 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4559 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4560 srenew(comm->buf_int, comm->nalloc_int);
4563 /* Communicate the charge group indices, sizes and flags */
4564 dd_sendrecv_int(dd, d, dir,
4565 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4566 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4568 nvs = ncg[cdd] + nat[cdd]*nvec;
4569 i = rbuf[0] + rbuf[1] *nvec;
4570 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4572 /* Communicate cgcm and state */
4573 dd_sendrecv_rvec(dd, d, dir,
4574 comm->cgcm_state[cdd], nvs,
4575 comm->vbuf.v+nvr, i);
4576 ncg_recv += rbuf[0];
4577 nvr += i;
4580 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4581 if (fr->cutoff_scheme == ecutsGROUP)
4583 /* Here we resize to more than necessary and shrink later */
4584 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4587 /* Process the received charge groups */
4588 buf_pos = 0;
4589 for (cg = 0; cg < ncg_recv; cg++)
4591 flag = comm->buf_int[cg*DD_CGIBS+1];
4593 if (dim >= npbcdim && dd->nc[dim] > 2)
4595 /* No pbc in this dim and more than one domain boundary.
4596  * We do a separate check that the charge group did not move too far.
4598 if (((flag & DD_FLAG_FW(d)) &&
4599 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4600 ((flag & DD_FLAG_BW(d)) &&
4601 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4603 cg_move_error(fplog, dd, step, cg, dim,
4604 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4605 fr->cutoff_scheme == ecutsGROUP, 0,
4606 comm->vbuf.v[buf_pos],
4607 comm->vbuf.v[buf_pos],
4608 comm->vbuf.v[buf_pos][dim]);
4612 mc = -1;
4613 if (d < dd->ndim-1)
4615 /* Check which direction this cg should go */
4616 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4618 if (isDlbOn(dd->comm))
4620 /* The cell boundaries for dimension d2 are not equal
4621 * for each cell row of the lower dimension(s),
4622 * therefore we might need to redetermine where
4623 * this cg should go.
4625 dim2 = dd->dim[d2];
4626 /* If this cg crosses the box boundary in dimension d2
4627 * we can use the communicated flag, so we do not
4628 * have to worry about pbc.
4630 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4631 (flag & DD_FLAG_FW(d2))) ||
4632 (dd->ci[dim2] == 0 &&
4633 (flag & DD_FLAG_BW(d2)))))
4635 /* Clear the two flags for this dimension */
4636 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4637 /* Determine the location of this cg
4638 * in lattice coordinates
4640 pos_d = comm->vbuf.v[buf_pos][dim2];
4641 if (tric_dir[dim2])
4643 for (d3 = dim2+1; d3 < DIM; d3++)
4645 pos_d +=
4646 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4649 /* Check that we are not at the box edge.
4650  * pbc is only handled in the first step above,
4651  * but due to different rounding this check could move
4652  * a group over pbc while the first step did not.
4654 if (pos_d >= cell_x1[dim2] &&
4655 dd->ci[dim2] != dd->nc[dim2]-1)
4657 flag |= DD_FLAG_FW(d2);
4659 else if (pos_d < cell_x0[dim2] &&
4660 dd->ci[dim2] != 0)
4662 flag |= DD_FLAG_BW(d2);
4664 comm->buf_int[cg*DD_CGIBS+1] = flag;
4667 /* Set to which neighboring cell this cg should go */
4668 if (flag & DD_FLAG_FW(d2))
4670 mc = d2*2;
4672 else if (flag & DD_FLAG_BW(d2))
4674 if (dd->nc[dd->dim[d2]] > 2)
4676 mc = d2*2+1;
4678 else
4680 mc = d2*2;
4686 nrcg = flag & DD_FLAG_NRCG;
4687 if (mc == -1)
4689 if (home_pos_cg+1 > dd->cg_nalloc)
4691 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4692 srenew(dd->index_gl, dd->cg_nalloc);
4693 srenew(dd->cgindex, dd->cg_nalloc+1);
4695 /* Set the global charge group index and size */
4696 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4697 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4698 /* Copy the state from the buffer */
4699 if (fr->cutoff_scheme == ecutsGROUP)
4701 cg_cm = fr->cg_cm;
4702 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4704 buf_pos++;
4706 /* Set the cginfo */
4707 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4708 dd->index_gl[home_pos_cg]);
4709 if (comm->bLocalCG)
4711 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4714 for (i = 0; i < nrcg; i++)
4716 copy_rvec(comm->vbuf.v[buf_pos++],
4717 state->x[home_pos_at+i]);
4719 if (bV)
4721 for (i = 0; i < nrcg; i++)
4723 copy_rvec(comm->vbuf.v[buf_pos++],
4724 state->v[home_pos_at+i]);
4727 if (bCGP)
4729 for (i = 0; i < nrcg; i++)
4731 copy_rvec(comm->vbuf.v[buf_pos++],
4732 state->cg_p[home_pos_at+i]);
4735 home_pos_cg += 1;
4736 home_pos_at += nrcg;
4738 else
4740 /* Reallocate the buffers if necessary */
4741 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4743 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4744 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4746 nvr = ncg[mc] + nat[mc]*nvec;
4747 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4749 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4750 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4752 /* Copy from the receive to the send buffers */
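/* Each forwarded group keeps its DD_CGIBS integers (global index plus
 * flags) and 1 + nrcg*nvec rvecs: the charge-group center followed by
 * the atom vectors, exactly as they arrived, so the next rank can unpack
 * them in the same way.
 */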
4753 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4754 comm->buf_int + cg*DD_CGIBS,
4755 DD_CGIBS*sizeof(int));
4756 memcpy(comm->cgcm_state[mc][nvr],
4757 comm->vbuf.v[buf_pos],
4758 (1+nrcg*nvec)*sizeof(rvec));
4759 buf_pos += 1 + nrcg*nvec;
4760 ncg[mc] += 1;
4761 nat[mc] += nrcg;
4766 /* With sorting (!bCompact) the indices are now only partially up to date
4767 * and ncg_home and nat_home are not the real count, since there are
4768 * "holes" in the arrays for the charge groups that moved to neighbors.
4770 if (fr->cutoff_scheme == ecutsVERLET)
4772 moved = get_moved(comm, home_pos_cg);
4774 for (i = dd->ncg_home; i < home_pos_cg; i++)
4776 moved[i] = 0;
4779 dd->ncg_home = home_pos_cg;
4780 dd->nat_home = home_pos_at;
4782 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4784 /* We overallocated before, we need to set the right size here */
4785 dd_resize_state(state, f, dd->nat_home);
4788 if (debug)
4790 fprintf(debug,
4791 "Finished repartitioning: cgs moved out %d, new home %d\n",
4792 *ncg_moved, dd->ncg_home-*ncg_moved);
4797 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
4799 /* Note that the cycles value can be incorrect, either 0 or some
4800 * extremely large value, when our thread migrated to another core
4801 * with an unsynchronized cycle counter. If this happens less often
4802 * than once per nstlist steps, this will not cause issues, since
4803 * we later subtract the maximum value from the sum over nstlist steps.
4804 * A zero count will slightly lower the total, but that's a small effect.
4805 * Note that the main purpose of the subtraction of the maximum value
4806 * is to avoid throwing off the load balancing when stalls occur due
4807 * to e.g. system activity or network congestion.
4809 dd->comm->cycl[ddCycl] += cycles;
4810 dd->comm->cycl_n[ddCycl]++;
4811 if (cycles > dd->comm->cycl_max[ddCycl])
4813 dd->comm->cycl_max[ddCycl] = cycles;
4817 static double force_flop_count(t_nrnb *nrnb)
4819 int i;
4820 double sum;
4821 const char *name;
4823 sum = 0;
4824 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4826 /* To get closer to the real timings, we halve the count
4827 * for the normal loops and halve it again for water loops.
4829 name = nrnb_str(i);
4830 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4832 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4834 else
4836 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4839 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4841 name = nrnb_str(i);
4842 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4844 sum += nrnb->n[i]*cost_nrnb(i);
4847 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4849 sum += nrnb->n[i]*cost_nrnb(i);
4852 return sum;
4855 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4857 if (dd->comm->eFlop)
4859 dd->comm->flop -= force_flop_count(nrnb);
4862 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4864 if (dd->comm->eFlop)
4866 dd->comm->flop += force_flop_count(nrnb);
4867 dd->comm->flop_n++;
4871 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4873 int i;
4875 for (i = 0; i < ddCyclNr; i++)
4877 dd->comm->cycl[i] = 0;
4878 dd->comm->cycl_n[i] = 0;
4879 dd->comm->cycl_max[i] = 0;
4881 dd->comm->flop = 0;
4882 dd->comm->flop_n = 0;
4885 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4887 gmx_domdec_comm_t *comm;
4888 domdec_load_t *load;
4889 domdec_root_t *root = nullptr;
4890 int d, dim, i, pos;
4891 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4892 gmx_bool bSepPME;
4894 if (debug)
4896 fprintf(debug, "get_load_distribution start\n");
4899 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4901 comm = dd->comm;
4903 bSepPME = (dd->pme_nodeid >= 0);
4905 if (dd->ndim == 0 && bSepPME)
4907 /* Without decomposition, but with PME nodes, we need the load */
4908 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4909 comm->load[0].pme = comm->cycl[ddCyclPME];
4912 for (d = dd->ndim-1; d >= 0; d--)
4914 dim = dd->dim[d];
4915 /* Check if we participate in the communication in this dimension */
4916 if (d == dd->ndim-1 ||
4917 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4919 load = &comm->load[d];
4920 if (isDlbOn(dd->comm))
4922 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4924 pos = 0;
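/* Pack this rank's load data into sbuf. The innermost dimension
 * (d == ndim-1) packs the measured force load, the other dimensions
 * forward the values already reduced for dimension d+1. The packing
 * order here must match the unpacking order in the root branch below.
 */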
4925 if (d == dd->ndim-1)
4927 sbuf[pos++] = dd_force_load(comm);
4928 sbuf[pos++] = sbuf[0];
4929 if (isDlbOn(dd->comm))
4931 sbuf[pos++] = sbuf[0];
4932 sbuf[pos++] = cell_frac;
4933 if (d > 0)
4935 sbuf[pos++] = comm->cell_f_max0[d];
4936 sbuf[pos++] = comm->cell_f_min1[d];
4939 if (bSepPME)
4941 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4942 sbuf[pos++] = comm->cycl[ddCyclPME];
4945 else
4947 sbuf[pos++] = comm->load[d+1].sum;
4948 sbuf[pos++] = comm->load[d+1].max;
4949 if (isDlbOn(dd->comm))
4951 sbuf[pos++] = comm->load[d+1].sum_m;
4952 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4953 sbuf[pos++] = comm->load[d+1].flags;
4954 if (d > 0)
4956 sbuf[pos++] = comm->cell_f_max0[d];
4957 sbuf[pos++] = comm->cell_f_min1[d];
4960 if (bSepPME)
4962 sbuf[pos++] = comm->load[d+1].mdf;
4963 sbuf[pos++] = comm->load[d+1].pme;
4966 load->nload = pos;
4967 /* Communicate a row in DD direction d.
4968 * The communicators are set up such that the root always has rank 0.
4970 #if GMX_MPI
4971 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
4972 load->load, load->nload*sizeof(float), MPI_BYTE,
4973 0, comm->mpi_comm_load[d]);
4974 #endif
4975 if (dd->ci[dim] == dd->master_ci[dim])
4977 /* We are the root, process this row */
4978 if (isDlbOn(comm))
4980 root = comm->root[d];
4982 load->sum = 0;
4983 load->max = 0;
4984 load->sum_m = 0;
4985 load->cvol_min = 1;
4986 load->flags = 0;
4987 load->mdf = 0;
4988 load->pme = 0;
4989 pos = 0;
4990 for (i = 0; i < dd->nc[dim]; i++)
4992 load->sum += load->load[pos++];
4993 load->max = std::max(load->max, load->load[pos]);
4994 pos++;
4995 if (isDlbOn(dd->comm))
4997 if (root->bLimited)
4999 /* This direction could not be load balanced properly,
5000 * therefore we need to use the maximum instead of the average load.
5002 load->sum_m = std::max(load->sum_m, load->load[pos]);
5004 else
5006 load->sum_m += load->load[pos];
5008 pos++;
5009 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5010 pos++;
5011 if (d < dd->ndim-1)
5013 load->flags = (int)(load->load[pos++] + 0.5);
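/* (The flags value just read arrived as a float via MPI_Gather,
 * hence the rounding back to an integer.) */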
5015 if (d > 0)
5017 root->cell_f_max0[i] = load->load[pos++];
5018 root->cell_f_min1[i] = load->load[pos++];
5021 if (bSepPME)
5023 load->mdf = std::max(load->mdf, load->load[pos]);
5024 pos++;
5025 load->pme = std::max(load->pme, load->load[pos]);
5026 pos++;
5029 if (isDlbOn(comm) && root->bLimited)
5031 load->sum_m *= dd->nc[dim];
5032 load->flags |= (1<<d);
5038 if (DDMASTER(dd))
5040 comm->nload += dd_load_count(comm);
5041 comm->load_step += comm->cycl[ddCyclStep];
5042 comm->load_sum += comm->load[0].sum;
5043 comm->load_max += comm->load[0].max;
5044 if (isDlbOn(comm))
5046 for (d = 0; d < dd->ndim; d++)
5048 if (comm->load[0].flags & (1<<d))
5050 comm->load_lim[d]++;
5054 if (bSepPME)
5056 comm->load_mdf += comm->load[0].mdf;
5057 comm->load_pme += comm->load[0].pme;
5061 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5063 if (debug)
5065 fprintf(debug, "get_load_distribution finished\n");
5069 static float dd_force_load_fraction(gmx_domdec_t *dd)
5071 /* Return the fraction of the total run time that is spent
5072 * in the balanceable force calculation part of the MD step.
5074 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5076 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
5078 else
5080 return 0;
5084 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5086 /* Return the relative performance loss on the total run time
5087 * due to the force calculation load imbalance.
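 * The loss equals (max load - average load) summed over the run, divided
 * by the total step time: the average fraction of a step that the ranks
 * spend waiting for the most loaded rank.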
5089 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5091 return
5092 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5093 (dd->comm->load_step*dd->nnodes);
5095 else
5097 return 0;
5101 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5103 gmx_domdec_comm_t *comm = dd->comm;
5105 /* Only the master rank prints loads and only if we measured loads */
5106 if (!DDMASTER(dd) || comm->nload == 0)
5108 return;
5111 char buf[STRLEN];
5112 int numPpRanks = dd->nnodes;
5113 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5114 int numRanks = numPpRanks + numPmeRanks;
5115 float lossFraction = 0;
5117 /* Print the average load imbalance and performance loss */
5118 if (dd->nnodes > 1 && comm->load_sum > 0)
5120 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
5121 lossFraction = dd_force_imb_perf_loss(dd);
5123 std::string msg = "\n Dynamic load balancing report:\n";
5124 std::string dlbStateStr = "";
5126 switch (dd->comm->dlbState)
5128 case edlbsOffUser:
5129 dlbStateStr = "DLB was off during the run per user request.";
5130 break;
5131 case edlbsOffForever:
5132 /* Currently this can happen due to performance loss observed, cell size
5133 * limitations or incompatibility with other settings observed during
5134 * determineInitialDlbState(). */
5135 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
5136 break;
5137 case edlbsOffCanTurnOn:
5138 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
5139 break;
5140 case edlbsOffTemporarilyLocked:
5141 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
5142 break;
5143 case edlbsOnCanTurnOff:
5144 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
5145 break;
5146 case edlbsOnUser:
5147 dlbStateStr = "DLB was permanently on during the run per user request.";
5148 break;
5149 default:
5150 GMX_ASSERT(false, "Undocumented DLB state");
5153 msg += " " + dlbStateStr + "\n";
5154 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
5155 msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
5156 static_cast<int>(dd_force_load_fraction(dd)*100 + 0.5));
5157 msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
5158 lossFraction*100);
5159 fprintf(fplog, "%s", msg.c_str());
5160 fprintf(stderr, "%s", msg.c_str());
5163 /* Print during what percentage of steps the load balancing was limited */
5164 bool dlbWasLimited = false;
5165 if (isDlbOn(comm))
5167 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5168 for (int d = 0; d < dd->ndim; d++)
5170 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
5171 sprintf(buf+strlen(buf), " %c %d %%",
5172 dim2char(dd->dim[d]), limitPercentage);
5173 if (limitPercentage >= 50)
5175 dlbWasLimited = true;
5178 sprintf(buf + strlen(buf), "\n");
5179 fprintf(fplog, "%s", buf);
5180 fprintf(stderr, "%s", buf);
5183 /* Print the performance loss due to separate PME - PP rank imbalance */
5184 float lossFractionPme = 0;
5185 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5187 float pmeForceRatio = comm->load_pme/comm->load_mdf;
5188 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
5189 if (lossFractionPme <= 0)
5191 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
5193 else
5195 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
5197 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
5198 fprintf(fplog, "%s", buf);
5199 fprintf(stderr, "%s", buf);
5200 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
5201 fprintf(fplog, "%s", buf);
5202 fprintf(stderr, "%s", buf);
5204 fprintf(fplog, "\n");
5205 fprintf(stderr, "\n");
5207 if (lossFraction >= DD_PERF_LOSS_WARN)
5209 sprintf(buf,
5210 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5211 " in the domain decomposition.\n", lossFraction*100);
5212 if (!isDlbOn(comm))
5214 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb).\n");
5216 else if (dlbWasLimited)
5218 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5220 fprintf(fplog, "%s\n", buf);
5221 fprintf(stderr, "%s\n", buf);
5223 if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
5225 sprintf(buf,
5226 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5227 " had %s work to do than the PP ranks.\n"
5228 " You might want to %s the number of PME ranks\n"
5229 " or %s the cut-off and the grid spacing.\n",
5230 fabs(lossFractionPme*100),
5231 (lossFractionPme < 0) ? "less" : "more",
5232 (lossFractionPme < 0) ? "decrease" : "increase",
5233 (lossFractionPme < 0) ? "decrease" : "increase");
5234 fprintf(fplog, "%s\n", buf);
5235 fprintf(stderr, "%s\n", buf);
5239 static float dd_vol_min(gmx_domdec_t *dd)
5241 return dd->comm->load[0].cvol_min*dd->nnodes;
5244 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5246 return dd->comm->load[0].flags;
5249 static float dd_f_imbal(gmx_domdec_t *dd)
5251 if (dd->comm->load[0].sum > 0)
5253 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5255 else
5257 /* Something is wrong in the cycle counting, report no load imbalance */
5258 return 0.0f;
5262 float dd_pme_f_ratio(gmx_domdec_t *dd)
5264 /* Should only be called on the DD master rank */
5265 assert(DDMASTER(dd));
5267 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5269 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5271 else
5273 return -1.0;
5277 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5279 int flags, d;
5280 char buf[22];
5282 flags = dd_load_flags(dd);
5283 if (flags)
5285 fprintf(fplog,
5286 "DD load balancing is limited by minimum cell size in dimension");
5287 for (d = 0; d < dd->ndim; d++)
5289 if (flags & (1<<d))
5291 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5294 fprintf(fplog, "\n");
5296 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5297 if (isDlbOn(dd->comm))
5299 fprintf(fplog, " vol min/aver %5.3f%c",
5300 dd_vol_min(dd), flags ? '!' : ' ');
5302 if (dd->nnodes > 1)
5304 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5306 if (dd->comm->cycl_n[ddCyclPME])
5308 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5310 fprintf(fplog, "\n\n");
5313 static void dd_print_load_verbose(gmx_domdec_t *dd)
5315 if (isDlbOn(dd->comm))
5317 fprintf(stderr, "vol %4.2f%c ",
5318 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5320 if (dd->nnodes > 1)
5322 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5324 if (dd->comm->cycl_n[ddCyclPME])
5326 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5330 #if GMX_MPI
5331 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5333 MPI_Comm c_row;
5334 int dim, i, rank;
5335 ivec loc_c;
5336 domdec_root_t *root;
5337 gmx_bool bPartOfGroup = FALSE;
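/* The load communicator for dim_ind couples the ranks that form a row
 * along dd->dim[dim_ind] through the grid point loc, i.e. all ranks that
 * share the remaining grid coordinates; below we check whether this rank
 * lies on that row.
 */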
5339 dim = dd->dim[dim_ind];
5340 copy_ivec(loc, loc_c);
5341 for (i = 0; i < dd->nc[dim]; i++)
5343 loc_c[dim] = i;
5344 rank = dd_index(dd->nc, loc_c);
5345 if (rank == dd->rank)
5347 /* This process is part of the group */
5348 bPartOfGroup = TRUE;
5351 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5352 &c_row);
5353 if (bPartOfGroup)
5355 dd->comm->mpi_comm_load[dim_ind] = c_row;
5356 if (!isDlbDisabled(dd->comm))
5358 if (dd->ci[dim] == dd->master_ci[dim])
5360 /* This is the root process of this row */
5361 snew(dd->comm->root[dim_ind], 1);
5362 root = dd->comm->root[dim_ind];
5363 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5364 snew(root->old_cell_f, dd->nc[dim]+1);
5365 snew(root->bCellMin, dd->nc[dim]);
5366 if (dim_ind > 0)
5368 snew(root->cell_f_max0, dd->nc[dim]);
5369 snew(root->cell_f_min1, dd->nc[dim]);
5370 snew(root->bound_min, dd->nc[dim]);
5371 snew(root->bound_max, dd->nc[dim]);
5373 snew(root->buf_ncd, dd->nc[dim]);
5375 else
5377 /* This is not a root process, we only need to receive cell_f */
5378 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5381 if (dd->ci[dim] == dd->master_ci[dim])
5383 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5387 #endif
5389 void dd_setup_dlb_resource_sharing(t_commrec *cr,
5390 int gpu_id)
5392 #if GMX_MPI
5393 int physicalnode_id_hash;
5394 gmx_domdec_t *dd;
5395 MPI_Comm mpi_comm_pp_physicalnode;
5397 if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
5399 /* Only ranks with short-ranged tasks (currently) use GPUs.
5400 * If we don't have GPUs assigned, there are no resources to share.
5402 return;
5405 physicalnode_id_hash = gmx_physicalnode_id_hash();
5407 dd = cr->dd;
5409 if (debug)
5411 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5412 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5413 dd->rank, physicalnode_id_hash, gpu_id);
5415 /* Split the PP communicator over the physical nodes */
5416 /* TODO: See if we should store this (before), as it's also used
5417 * for the nodecomm summation.
5419 // TODO PhysicalNodeCommunicator could be extended/used to handle
5420 // the need for per-node per-group communicators.
5421 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5422 &mpi_comm_pp_physicalnode);
5423 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5424 &dd->comm->mpi_comm_gpu_shared);
5425 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5426 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5428 if (debug)
5430 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5433 /* Note that some ranks could share a GPU, while others don't */
5435 if (dd->comm->nrank_gpu_shared == 1)
5437 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5439 #else
5440 GMX_UNUSED_VALUE(cr);
5441 GMX_UNUSED_VALUE(gpu_id);
5442 #endif
5445 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5447 #if GMX_MPI
5448 int dim0, dim1, i, j;
5449 ivec loc;
5451 if (debug)
5453 fprintf(debug, "Making load communicators\n");
5456 snew(dd->comm->load, std::max(dd->ndim, 1));
5457 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5459 if (dd->ndim == 0)
5461 return;
5464 clear_ivec(loc);
5465 make_load_communicator(dd, 0, loc);
5466 if (dd->ndim > 1)
5468 dim0 = dd->dim[0];
5469 for (i = 0; i < dd->nc[dim0]; i++)
5471 loc[dim0] = i;
5472 make_load_communicator(dd, 1, loc);
5475 if (dd->ndim > 2)
5477 dim0 = dd->dim[0];
5478 for (i = 0; i < dd->nc[dim0]; i++)
5480 loc[dim0] = i;
5481 dim1 = dd->dim[1];
5482 for (j = 0; j < dd->nc[dim1]; j++)
5484 loc[dim1] = j;
5485 make_load_communicator(dd, 2, loc);
5490 if (debug)
5492 fprintf(debug, "Finished making load communicators\n");
5494 #endif
5497 /*! \brief Sets up the relation between neighboring domains and zones */
5498 static void setup_neighbor_relations(gmx_domdec_t *dd)
5500 int d, dim, i, j, m;
5501 ivec tmp, s;
5502 gmx_domdec_zones_t *zones;
5503 gmx_domdec_ns_ranges_t *izone;
5505 for (d = 0; d < dd->ndim; d++)
5507 dim = dd->dim[d];
5508 copy_ivec(dd->ci, tmp);
5509 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5510 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5511 copy_ivec(dd->ci, tmp);
5512 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5513 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5514 if (debug)
5516 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5517 dd->rank, dim,
5518 dd->neighbor[d][0],
5519 dd->neighbor[d][1]);
5523 int nzone = (1 << dd->ndim);
5524 int nizone = (1 << std::max(dd->ndim - 1, 0));
5525 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
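/* With ndim decomposed dimensions there are 2^ndim zones: the home zone
 * plus the zones communicated from the forward neighbors. Each zone's
 * shift vector gives its offset from the home cell along the decomposed
 * dimensions. Non-bonded pair interactions are computed from the
 * 2^(ndim-1) i-zones against the j-zone ranges set up below.
 */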
5527 zones = &dd->comm->zones;
5529 for (i = 0; i < nzone; i++)
5531 m = 0;
5532 clear_ivec(zones->shift[i]);
5533 for (d = 0; d < dd->ndim; d++)
5535 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5539 zones->n = nzone;
5540 for (i = 0; i < nzone; i++)
5542 for (d = 0; d < DIM; d++)
5544 s[d] = dd->ci[d] - zones->shift[i][d];
5545 if (s[d] < 0)
5547 s[d] += dd->nc[d];
5549 else if (s[d] >= dd->nc[d])
5551 s[d] -= dd->nc[d];
5555 zones->nizone = nizone;
5556 for (i = 0; i < zones->nizone; i++)
5558 assert(ddNonbondedZonePairRanges[i][0] == i);
5560 izone = &zones->izone[i];
5561 /* The pair ranges in ddNonbondedZonePairRanges are for 3D decomposition,
5562 * for fewer dimensions use only j-zones up to nzone.
5564 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5565 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5566 for (dim = 0; dim < DIM; dim++)
5568 if (dd->nc[dim] == 1)
5570 /* All shifts should be allowed */
5571 izone->shift0[dim] = -1;
5572 izone->shift1[dim] = 1;
5574 else
5576 /* Determine the min/max j-zone shift wrt the i-zone */
5577 izone->shift0[dim] = 1;
5578 izone->shift1[dim] = -1;
5579 for (j = izone->j0; j < izone->j1; j++)
5581 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5582 if (shift_diff < izone->shift0[dim])
5584 izone->shift0[dim] = shift_diff;
5586 if (shift_diff > izone->shift1[dim])
5588 izone->shift1[dim] = shift_diff;
5595 if (!isDlbDisabled(dd->comm))
5597 snew(dd->comm->root, dd->ndim);
5600 if (dd->comm->bRecordLoad)
5602 make_load_communicators(dd);
5606 static void make_pp_communicator(FILE *fplog,
5607 gmx_domdec_t *dd,
5608 t_commrec gmx_unused *cr,
5609 int gmx_unused reorder)
5611 #if GMX_MPI
5612 gmx_domdec_comm_t *comm;
5613 int rank, *buf;
5614 ivec periods;
5615 MPI_Comm comm_cart;
5617 comm = dd->comm;
5619 if (comm->bCartesianPP)
5621 /* Set up cartesian communication for the particle-particle part */
5622 if (fplog)
5624 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5625 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5628 for (int i = 0; i < DIM; i++)
5630 periods[i] = TRUE;
5632 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5633 &comm_cart);
5634 /* We overwrite the old communicator with the new cartesian one */
5635 cr->mpi_comm_mygroup = comm_cart;
5638 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5639 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5641 if (comm->bCartesianPP_PME)
5643 /* Since we want to use the original cartesian setup for sim,
5644 * and not the one after split, we need to make an index.
5646 snew(comm->ddindex2ddnodeid, dd->nnodes);
5647 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5648 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5649 /* Get the rank of the DD master,
5650 * above we made sure that the master node is a PP node.
5652 if (MASTER(cr))
5654 rank = dd->rank;
5656 else
5658 rank = 0;
5660 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5662 else if (comm->bCartesianPP)
5664 if (cr->npmenodes == 0)
5666 /* The PP communicator is also
5667 * the communicator for this simulation
5669 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5671 cr->nodeid = dd->rank;
5673 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5675 /* We need to make an index to go from the coordinates
5676 * to the nodeid of this simulation.
5678 snew(comm->ddindex2simnodeid, dd->nnodes);
5679 snew(buf, dd->nnodes);
5680 if (thisRankHasDuty(cr, DUTY_PP))
5682 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5684 /* Communicate the ddindex to simulation nodeid index */
5685 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5686 cr->mpi_comm_mysim);
5687 sfree(buf);
5689 /* Determine the master coordinates and rank.
5690 * The DD master should be the same node as the master of this sim.
5692 for (int i = 0; i < dd->nnodes; i++)
5694 if (comm->ddindex2simnodeid[i] == 0)
5696 ddindex2xyz(dd->nc, i, dd->master_ci);
5697 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5700 if (debug)
5702 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5705 else
5707 /* No Cartesian communicators */
5708 /* We use the rank in dd->mpi_comm_all as DD index */
5709 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5710 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5711 dd->masterrank = 0;
5712 clear_ivec(dd->master_ci);
5714 #endif
5716 if (fplog)
5718 fprintf(fplog,
5719 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5720 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5722 if (debug)
5724 fprintf(debug,
5725 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5726 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5730 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5731 t_commrec *cr)
5733 #if GMX_MPI
5734 gmx_domdec_comm_t *comm = dd->comm;
5736 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5738 int *buf;
5739 snew(comm->ddindex2simnodeid, dd->nnodes);
5740 snew(buf, dd->nnodes);
5741 if (thisRankHasDuty(cr, DUTY_PP))
5743 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5745 /* Communicate the ddindex to simulation nodeid index */
5746 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5747 cr->mpi_comm_mysim);
5748 sfree(buf);
5750 #else
5751 GMX_UNUSED_VALUE(dd);
5752 GMX_UNUSED_VALUE(cr);
5753 #endif
5756 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5757 int ncg, int natoms)
5759 gmx_domdec_master_t *ma;
5760 int i;
5762 snew(ma, 1);
5764 snew(ma->ncg, dd->nnodes);
5765 snew(ma->index, dd->nnodes+1);
5766 snew(ma->cg, ncg);
5767 snew(ma->nat, dd->nnodes);
5768 snew(ma->ibuf, dd->nnodes*2);
5769 snew(ma->cell_x, DIM);
5770 for (i = 0; i < DIM; i++)
5772 snew(ma->cell_x[i], dd->nc[i]+1);
5775 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5777 ma->vbuf = nullptr;
5779 else
5781 snew(ma->vbuf, natoms);
5784 return ma;
5787 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5788 DdRankOrder gmx_unused rankOrder,
5789 int gmx_unused reorder)
5791 gmx_domdec_comm_t *comm;
5792 int i;
5793 gmx_bool bDiv[DIM];
5794 #if GMX_MPI
5795 MPI_Comm comm_cart;
5796 #endif
5798 comm = dd->comm;
5800 if (comm->bCartesianPP)
5802 for (i = 1; i < DIM; i++)
5804 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
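/* bDiv[i] is TRUE when the PME-only ranks can form whole extra layers of
 * the Cartesian grid along dimension i. Illustration with hypothetical
 * numbers: with a 4x4x2 PP grid (32 ranks) and 8 PME ranks,
 * bDiv[YY] = ((8*4) % 32 == 0) is TRUE, so the PME ranks fit as one extra
 * 4x1x2 layer along y, while bDiv[ZZ] = ((8*2) % 32 == 0) is FALSE.
 */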
5806 if (bDiv[YY] || bDiv[ZZ])
5808 comm->bCartesianPP_PME = TRUE;
5809 /* If we have 2D PME decomposition, which is always in x+y,
5810 * we stack the PME only nodes in z.
5811 * Otherwise we choose the direction that provides the thinnest slab
5812 * of PME only nodes as this will have the least effect
5813 * on the PP communication.
5814 * But for the PME communication the opposite might be better.
5816 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5817 !bDiv[YY] ||
5818 dd->nc[YY] > dd->nc[ZZ]))
5820 comm->cartpmedim = ZZ;
5822 else
5824 comm->cartpmedim = YY;
5826 comm->ntot[comm->cartpmedim]
5827 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5829 else if (fplog)
5831 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5832 fprintf(fplog,
5833 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5837 if (comm->bCartesianPP_PME)
5839 #if GMX_MPI
5840 int rank;
5841 ivec periods;
5843 if (fplog)
5845 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5848 for (i = 0; i < DIM; i++)
5850 periods[i] = TRUE;
5852 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5853 &comm_cart);
5854 MPI_Comm_rank(comm_cart, &rank);
5855 if (MASTER(cr) && rank != 0)
5857 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5860 /* With this assignment we lose the link to the original communicator,
5861 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5863 cr->mpi_comm_mysim = comm_cart;
5864 cr->sim_nodeid = rank;
5866 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5868 if (fplog)
5870 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5871 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5874 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5876 cr->duty = DUTY_PP;
5878 if (cr->npmenodes == 0 ||
5879 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5881 cr->duty = DUTY_PME;
5884 /* Split the sim communicator into PP and PME only nodes */
5885 MPI_Comm_split(cr->mpi_comm_mysim,
5886 getThisRankDuties(cr),
5887 dd_index(comm->ntot, dd->ci),
5888 &cr->mpi_comm_mygroup);
5889 #endif
5891 else
5893 switch (rankOrder)
5895 case DdRankOrder::pp_pme:
5896 if (fplog)
5898 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5900 break;
5901 case DdRankOrder::interleave:
5902 /* Interleave the PP-only and PME-only ranks */
5903 if (fplog)
5905 fprintf(fplog, "Interleaving PP and PME ranks\n");
5907 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5908 break;
5909 case DdRankOrder::cartesian:
5910 break;
5911 default:
5912 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
5915 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5917 cr->duty = DUTY_PME;
5919 else
5921 cr->duty = DUTY_PP;
5923 #if GMX_MPI
5924 /* Split the sim communicator into PP and PME only nodes */
5925 MPI_Comm_split(cr->mpi_comm_mysim,
5926 getThisRankDuties(cr),
5927 cr->nodeid,
5928 &cr->mpi_comm_mygroup);
5929 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5930 #endif
5933 if (fplog)
5935 fprintf(fplog, "This rank does only %s work.\n\n",
5936 thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
5940 /*! \brief Generates the MPI communicators for domain decomposition */
5941 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5942 gmx_domdec_t *dd, DdRankOrder ddRankOrder)
5944 gmx_domdec_comm_t *comm;
5945 int CartReorder;
5947 comm = dd->comm;
5949 copy_ivec(dd->nc, comm->ntot);
5951 comm->bCartesianPP = (ddRankOrder == DdRankOrder::cartesian);
5952 comm->bCartesianPP_PME = FALSE;
5954 /* Reorder the nodes by default. This might change the MPI ranks.
5955 * Real reordering is only supported on very few architectures,
5956 * Blue Gene is one of them.
5958 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
5960 if (cr->npmenodes > 0)
5962 /* Split the communicator into a PP and PME part */
5963 split_communicator(fplog, cr, dd, ddRankOrder, CartReorder);
5964 if (comm->bCartesianPP_PME)
5966 /* We (possibly) reordered the nodes in split_communicator,
5967 * so it is no longer required in make_pp_communicator.
5969 CartReorder = FALSE;
5972 else
5974 /* All nodes do PP and PME */
5975 #if GMX_MPI
5976 /* We do not require separate communicators */
5977 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5978 #endif
5981 if (thisRankHasDuty(cr, DUTY_PP))
5983 /* Copy or make a new PP communicator */
5984 make_pp_communicator(fplog, dd, cr, CartReorder);
5986 else
5988 receive_ddindex2simnodeid(dd, cr);
5991 if (!thisRankHasDuty(cr, DUTY_PME))
5993 /* Set up the communication to our PME node */
5994 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
5995 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
5996 if (debug)
5998 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
5999 dd->pme_nodeid, dd->pme_receive_vir_ener);
6002 else
6004 dd->pme_nodeid = -1;
6007 if (DDMASTER(dd))
6009 dd->ma = init_gmx_domdec_master_t(dd,
6010 comm->cgs_gl.nr,
6011 comm->cgs_gl.index[comm->cgs_gl.nr]);
6015 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6017 real *slb_frac, tot;
6018 int i, n;
6019 double dbl;
6021 slb_frac = nullptr;
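/* Parse the user-supplied relative cell sizes for static load balancing
 * and normalize them to fractions. For example (hypothetical input), for
 * nc = 4 and size_string "1 1 2 2" the resulting fractions are
 * 0.167 0.167 0.333 0.333.
 */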
6022 if (nc > 1 && size_string != nullptr)
6024 if (fplog)
6026 fprintf(fplog, "Using static load balancing for the %s direction\n",
6027 dir);
6029 snew(slb_frac, nc);
6030 tot = 0;
6031 for (i = 0; i < nc; i++)
6033 dbl = 0;
6034 sscanf(size_string, "%20lf%n", &dbl, &n);
6035 if (dbl == 0)
6037 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6039 slb_frac[i] = dbl;
6040 size_string += n;
6041 tot += slb_frac[i];
6043 /* Normalize */
6044 if (fplog)
6046 fprintf(fplog, "Relative cell sizes:");
6048 for (i = 0; i < nc; i++)
6050 slb_frac[i] /= tot;
6051 if (fplog)
6053 fprintf(fplog, " %5.3f", slb_frac[i]);
6056 if (fplog)
6058 fprintf(fplog, "\n");
6062 return slb_frac;
6065 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6067 int n, nmol, ftype;
6068 gmx_mtop_ilistloop_t iloop;
6069 const t_ilist *il;
6071 n = 0;
6072 iloop = gmx_mtop_ilistloop_init(mtop);
6073 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6075 for (ftype = 0; ftype < F_NRE; ftype++)
6077 if ((interaction_function[ftype].flags & IF_BOND) &&
6078 NRAL(ftype) > 2)
6080 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
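/* Each entry in the interaction list consists of one type index followed
 * by NRAL(ftype) atom indices, hence the division by 1 + NRAL(ftype)
 * above to obtain the number of interactions.
 */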
6085 return n;
6088 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6090 char *val;
6091 int nst;
6093 nst = def;
6094 val = getenv(env_var);
6095 if (val)
6097 if (sscanf(val, "%20d", &nst) <= 0)
6099 nst = 1;
6101 if (fplog)
6103 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6104 env_var, val, nst);
6108 return nst;
6111 static void dd_warning(const t_commrec *cr, FILE *fplog, const char *warn_string)
6113 if (MASTER(cr))
6115 fprintf(stderr, "\n%s\n", warn_string);
6117 if (fplog)
6119 fprintf(fplog, "\n%s\n", warn_string);
6123 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6124 const t_inputrec *ir, FILE *fplog)
6126 if (ir->ePBC == epbcSCREW &&
6127 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6129 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6132 if (ir->ns_type == ensSIMPLE)
6134 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6137 if (ir->nstlist == 0)
6139 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6142 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6144 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6148 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6150 int di, d;
6151 real r;
6153 r = ddbox->box_size[XX];
6154 for (di = 0; di < dd->ndim; di++)
6156 d = dd->dim[di];
6157 /* Check using the initial average cell size */
6158 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6161 return r;
6164 /*! \brief Depending on the requested initial DLB state, return the switched-off DLB state or issue a fatal error.
6166 static int forceDlbOffOrBail(int cmdlineDlbState,
6167 const std::string &reasonStr,
6168 t_commrec *cr,
6169 FILE *fplog)
6171 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
6172 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
6174 if (cmdlineDlbState == edlbsOnUser)
6176 gmx_fatal(FARGS, (dlbNotSupportedErr + reasonStr).c_str());
6178 else if (cmdlineDlbState == edlbsOffCanTurnOn)
6180 dd_warning(cr, fplog, (dlbDisableNote + reasonStr + "\n").c_str());
6182 return edlbsOffForever;
6185 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
6187 * This function parses the parameters of the "-dlb" command-line option and sets
6188 * the corresponding state values. Then it checks the consistency of the determined
6189 * state with other run parameters and settings. As a result, the initial state
6190 * may be altered or an error may be thrown if incompatibility of options is detected.
6192 * \param [in] fplog Pointer to mdrun log file.
6193 * \param [in] cr Pointer to MPI communication object.
6194 * \param [in] dlbOption Enum value for the DLB option.
6195 * \param [in] bRecordLoad True if the load balancer is recording load information.
6196 * \param [in] mdrunOptions Options for mdrun.
6197 * \param [in] ir Pointer to the mdrun input parameters.
6198 * \returns DLB initial/startup state.
6200 static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
6201 DlbOption dlbOption, gmx_bool bRecordLoad,
6202 const MdrunOptions &mdrunOptions,
6203 const t_inputrec *ir)
6205 int dlbState = edlbsOffCanTurnOn;
6207 switch (dlbOption)
6209 case DlbOption::turnOnWhenUseful: dlbState = edlbsOffCanTurnOn; break;
6210 case DlbOption::no: dlbState = edlbsOffUser; break;
6211 case DlbOption::yes: dlbState = edlbsOnUser; break;
6212 default: gmx_incons("Invalid dlbOption enum value");
6215 /* Reruns don't support DLB: bail or override auto mode */
6216 if (mdrunOptions.rerun)
6218 std::string reasonStr = "it is not supported in reruns.";
6219 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6222 /* Unsupported integrators */
6223 if (!EI_DYNAMICS(ir->eI))
6225 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
6226 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6229 /* Without cycle counters we can't time work to balance on */
6230 if (!bRecordLoad)
6232 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
6233 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6236 if (mdrunOptions.reproducible)
6238 std::string reasonStr = "you started a reproducible run.";
6239 switch (dlbState)
6241 case edlbsOffUser:
6242 break;
6243 case edlbsOffForever:
6244 GMX_RELEASE_ASSERT(false, "edlbsOffForever is not a valid initial state");
6245 break;
6246 case edlbsOffCanTurnOn:
6247 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6248 break;
6249 case edlbsOnCanTurnOff:
6250 GMX_RELEASE_ASSERT(false, "edlbsOnCanTurnOff is not a valid initial state");
6251 break;
6252 case edlbsOnUser:
6253 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", cr, fplog);
6254 break;
6255 default:
6256 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6257 break;
6261 return dlbState;
6264 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6266 int dim;
6268 dd->ndim = 0;
6269 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6271 /* Decomposition order z,y,x */
6272 if (fplog)
6274 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6276 for (dim = DIM-1; dim >= 0; dim--)
6278 if (dd->nc[dim] > 1)
6280 dd->dim[dd->ndim++] = dim;
6284 else
6286 /* Decomposition order x,y,z */
6287 for (dim = 0; dim < DIM; dim++)
6289 if (dd->nc[dim] > 1)
6291 dd->dim[dd->ndim++] = dim;
6297 static gmx_domdec_comm_t *init_dd_comm()
6299 gmx_domdec_comm_t *comm;
6300 int i;
6302 snew(comm, 1);
6303 snew(comm->cggl_flag, DIM*2);
6304 snew(comm->cgcm_state, DIM*2);
6305 for (i = 0; i < DIM*2; i++)
6307 comm->cggl_flag_nalloc[i] = 0;
6308 comm->cgcm_state_nalloc[i] = 0;
6311 comm->nalloc_int = 0;
6312 comm->buf_int = nullptr;
6314 vec_rvec_init(&comm->vbuf);
6316 comm->n_load_have = 0;
6317 comm->n_load_collect = 0;
6319 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6321 comm->sum_nat[i] = 0;
6323 comm->ndecomp = 0;
6324 comm->nload = 0;
6325 comm->load_step = 0;
6326 comm->load_sum = 0;
6327 comm->load_max = 0;
6328 clear_ivec(comm->load_lim);
6329 comm->load_mdf = 0;
6330 comm->load_pme = 0;
6332 /* This should be replaced by a unique pointer */
6333 comm->balanceRegion = ddBalanceRegionAllocate();
6335 return comm;
6338 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6339 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6340 const DomdecOptions &options,
6341 const MdrunOptions &mdrunOptions,
6342 const gmx_mtop_t *mtop,
6343 const t_inputrec *ir,
6344 const matrix box, const rvec *xGlobal,
6345 gmx_ddbox_t *ddbox)
6347 real r_bonded = -1;
6348 real r_bonded_limit = -1;
6349 const real tenPercentMargin = 1.1;
6350 gmx_domdec_comm_t *comm = dd->comm;
6352 snew(comm->cggl_flag, DIM*2);
6353 snew(comm->cgcm_state, DIM*2);
6355 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6356 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6358 dd->pme_recv_f_alloc = 0;
6359 dd->pme_recv_f_buf = nullptr;
6361 /* Initialize to GPU share count to 0, might change later */
6362 comm->nrank_gpu_shared = 0;
6364 comm->dlbState = determineInitialDlbState(fplog, cr, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
6365 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6366 /* To consider turning DLB on after 2*nstlist steps we need to check
6367 * at partitioning count 3. Thus we need to increase the first count by 2.
6369 comm->ddPartioningCountFirstDlbOff += 2;
6371 if (fplog)
6373 fprintf(fplog, "Dynamic load balancing: %s\n",
6374 edlbs_names[comm->dlbState]);
6376 comm->bPMELoadBalDLBLimits = FALSE;
6378 /* Allocate the charge group/atom sorting struct */
6379 snew(comm->sort, 1);
6381 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6383 comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
6384 mtop->bIntermolecularInteractions);
6385 if (comm->bInterCGBondeds)
6387 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6389 else
6391 comm->bInterCGMultiBody = FALSE;
6394 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6395 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6397 if (ir->rlist == 0)
6399 /* Set the cut-off to some very large value,
6400 * so we don't need if statements everywhere in the code.
6401 * We use sqrt, since the cut-off is squared in some places.
6403 comm->cutoff = GMX_CUTOFF_INF;
6405 else
6407 comm->cutoff = ir->rlist;
6409 comm->cutoff_mbody = 0;
6411 comm->cellsize_limit = 0;
6412 comm->bBondComm = FALSE;
6414 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6415 * within nstlist steps. Since boundaries are allowed to displace by half
6416 * a cell size, DD cells should be at least the size of the list buffer.
6418 comm->cellsize_limit = std::max(comm->cellsize_limit,
6419 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6421 if (comm->bInterCGBondeds)
6423 if (options.minimumCommunicationRange > 0)
6425 comm->cutoff_mbody = options.minimumCommunicationRange;
6426 if (options.useBondedCommunication)
6428 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6430 else
6432 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6434 r_bonded_limit = comm->cutoff_mbody;
6436 else if (ir->bPeriodicMols)
6438 /* Cannot easily determine the required cut-off */
6439 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6440 comm->cutoff_mbody = comm->cutoff/2;
6441 r_bonded_limit = comm->cutoff_mbody;
6443 else
6445 real r_2b, r_mb;
6447 if (MASTER(cr))
6449 dd_bonded_cg_distance(fplog, mtop, ir, xGlobal, box,
6450 options.checkBondedInteractions,
6451 &r_2b, &r_mb);
6453 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6454 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6456 /* We use an initial margin of 10% for the minimum cell size,
6457 * except when we are just below the non-bonded cut-off.
6459 if (options.useBondedCommunication)
6461 if (std::max(r_2b, r_mb) > comm->cutoff)
6463 r_bonded = std::max(r_2b, r_mb);
6464 r_bonded_limit = tenPercentMargin*r_bonded;
6465 comm->bBondComm = TRUE;
6467 else
6469 r_bonded = r_mb;
6470 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6472 /* We determine cutoff_mbody later */
6474 else
6476 /* No special bonded communication,
6477 * simply increase the DD cut-off.
6479 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6480 comm->cutoff_mbody = r_bonded_limit;
6481 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6484 if (fplog)
6486 fprintf(fplog,
6487 "Minimum cell size due to bonded interactions: %.3f nm\n",
6488 r_bonded_limit);
6490 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6493 real rconstr = 0;
6494 if (dd->bInterCGcons && options.constraintCommunicationRange <= 0)
6496 /* There is a cell size limit due to the constraints (P-LINCS) */
6497 rconstr = constr_r_max(fplog, mtop, ir);
6498 if (fplog)
6500 fprintf(fplog,
6501 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6502 rconstr);
6503 if (rconstr > comm->cellsize_limit)
6505 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6509 else if (options.constraintCommunicationRange > 0 && fplog)
6511 /* Here we do not check for dd->bInterCGcons,
6512 * because one can also set a cell size limit for virtual sites only
6513 * and at this point we don't know yet if there are intercg v-sites.
6515 fprintf(fplog,
6516 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6517 options.constraintCommunicationRange);
6518 rconstr = options.constraintCommunicationRange;
6520 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6522 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6524 if (options.numCells[XX] > 0)
6526 copy_ivec(options.numCells, dd->nc);
6527 set_dd_dim(fplog, dd);
6528 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6530 if (options.numPmeRanks >= 0)
6532 cr->npmenodes = options.numPmeRanks;
6534 else
6536 /* When the DD grid is set explicitly and -npme is set to auto,
6537 * don't use PME ranks. We check later if the DD grid is
6538 * compatible with the total number of ranks.
6540 cr->npmenodes = 0;
6543 real acs = average_cellsize_min(dd, ddbox);
6544 if (acs < comm->cellsize_limit)
6546 if (fplog)
6548 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6550 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6551 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6552 acs, comm->cellsize_limit);
6555 else
6557 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6559 /* We need to choose the optimal DD grid and possibly PME nodes */
6560 real limit =
6561 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6562 options.numPmeRanks,
6563 !isDlbDisabled(comm),
6564 options.dlbScaling,
6565 comm->cellsize_limit, comm->cutoff,
6566 comm->bInterCGBondeds);
6568 if (dd->nc[XX] == 0)
6570 char buf[STRLEN];
6571 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6572 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6573 !bC ? "-rdd" : "-rcon",
6574 comm->dlbState != edlbsOffUser ? " or -dds" : "",
6575 bC ? " or your LINCS settings" : "");
6577 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6578 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6579 "%s\n"
6580 "Look in the log file for details on the domain decomposition",
6581 cr->nnodes-cr->npmenodes, limit, buf);
6583 set_dd_dim(fplog, dd);
6586 if (fplog)
6588 fprintf(fplog,
6589 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6590 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6593 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6594 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6596 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6597 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6598 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6600 if (cr->npmenodes > dd->nnodes)
6602 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6603 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6605 if (cr->npmenodes > 0)
6607 comm->npmenodes = cr->npmenodes;
6609 else
6611 comm->npmenodes = dd->nnodes;
6614 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6616 /* The following choices should match those
6617 * in comm_cost_est in domdec_setup.c.
6618 * Note that here the checks have to take into account
6619 * that the decomposition might occur in a different order than xyz
6620 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6621 * in which case they will not match those in comm_cost_est,
6622 * but since that is mainly for testing purposes that's fine.
6624 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6625 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6626 getenv("GMX_PMEONEDD") == nullptr)
6628 comm->npmedecompdim = 2;
6629 comm->npmenodes_x = dd->nc[XX];
6630 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6632 else
6634 /* In case nc is 1 in both x and y we could still choose to
6635 * decompose pme in y instead of x, but we use x for simplicity.
6637 comm->npmedecompdim = 1;
6638 if (dd->dim[0] == YY)
6640 comm->npmenodes_x = 1;
6641 comm->npmenodes_y = comm->npmenodes;
6643 else
6645 comm->npmenodes_x = comm->npmenodes;
6646 comm->npmenodes_y = 1;
6649 if (fplog)
6651 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6652 comm->npmenodes_x, comm->npmenodes_y, 1);
6655 else
6657 comm->npmedecompdim = 0;
6658 comm->npmenodes_x = 0;
6659 comm->npmenodes_y = 0;
6662 snew(comm->slb_frac, DIM);
6663 if (isDlbDisabled(comm))
6665 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], options.cellSizeX);
6666 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], options.cellSizeY);
6667 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], options.cellSizeZ);
6670 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6672 if (comm->bBondComm || !isDlbDisabled(comm))
6674 /* Set the bonded communication distance to halfway
6675 * the minimum and the maximum,
6676 * since the extra communication cost is nearly zero.
6678 real acs = average_cellsize_min(dd, ddbox);
6679 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6680 if (!isDlbDisabled(comm))
6682 /* Check if this does not limit the scaling */
6683 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
6684 options.dlbScaling*acs);
6686 if (!comm->bBondComm)
6688 /* Without bBondComm do not go beyond the n.b. cut-off */
6689 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6690 if (comm->cellsize_limit >= comm->cutoff)
6692 /* We don't lose a lot of efficiency
6693 * when increasing it to the n.b. cut-off.
6694 * It can even be slightly faster, because we need
6695 * fewer checks for the communication setup.
6697 comm->cutoff_mbody = comm->cutoff;
6700 /* Check if we did not end up below our original limit */
6701 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6703 if (comm->cutoff_mbody > comm->cellsize_limit)
6705 comm->cellsize_limit = comm->cutoff_mbody;
6708 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6711 if (debug)
6713 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6714 "cellsize limit %f\n",
6715 comm->bBondComm, comm->cellsize_limit);
6718 if (MASTER(cr))
6720 check_dd_restrictions(cr, dd, ir, fplog);
6724 static void set_dlb_limits(gmx_domdec_t *dd)
6727 int d;
6729 for (d = 0; d < dd->ndim; d++)
6731 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6732 dd->comm->cellsize_min[dd->dim[d]] =
6733 dd->comm->cellsize_min_dlb[dd->dim[d]];
6738 static void turn_on_dlb(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6740 gmx_domdec_t *dd;
6741 gmx_domdec_comm_t *comm;
6742 real cellsize_min;
6743 int d, nc, i;
6745 dd = cr->dd;
6746 comm = dd->comm;
6748 cellsize_min = comm->cellsize_min[dd->dim[0]];
6749 for (d = 1; d < dd->ndim; d++)
6751 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6754 /* Turn off DLB if we're too close to the cell size limit. */
6755 if (cellsize_min < comm->cellsize_limit*1.05)
6757 auto str = gmx::formatString("step %" GMX_PRId64 " Measured %.1f %% performance loss due to load imbalance, "
6758 "but the minimum cell size is smaller than 1.05 times the cell size limit."
6759 "Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6760 dd_warning(cr, fplog, str.c_str());
6762 comm->dlbState = edlbsOffForever;
6763 return;
6766 char buf[STRLEN];
6767 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6768 dd_warning(cr, fplog, buf);
6769 comm->dlbState = edlbsOnCanTurnOff;
6771 /* Store the non-DLB performance, so we can check if DLB actually
6772 * improves performance.
6774 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6775 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6777 set_dlb_limits(dd);
6779 /* We can set the required cell size info here,
6780 * so we do not need to communicate this.
6781 * The grid is completely uniform.
6783 for (d = 0; d < dd->ndim; d++)
6785 if (comm->root[d])
6787 comm->load[d].sum_m = comm->load[d].sum;
6789 nc = dd->nc[dd->dim[d]];
6790 for (i = 0; i < nc; i++)
6792 comm->root[d]->cell_f[i] = i/(real)nc;
6793 if (d > 0)
6795 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6796 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6799 comm->root[d]->cell_f[nc] = 1.0;
6804 static void turn_off_dlb(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6806 gmx_domdec_t *dd = cr->dd;
6808 char buf[STRLEN];
6809 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6810 dd_warning(cr, fplog, buf);
6811 dd->comm->dlbState = edlbsOffCanTurnOn;
6812 dd->comm->haveTurnedOffDlb = true;
6813 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6816 static void turn_off_dlb_forever(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6818 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6819 char buf[STRLEN];
6820 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6821 dd_warning(cr, fplog, buf);
6822 cr->dd->comm->dlbState = edlbsOffForever;
6825 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6827 int ncg, cg;
6828 char *bLocalCG;
6830 ncg = ncg_mtop(mtop);
6831 snew(bLocalCG, ncg);
6832 for (cg = 0; cg < ncg; cg++)
6834 bLocalCG[cg] = FALSE;
6837 return bLocalCG;
6840 void dd_init_bondeds(FILE *fplog,
6841 gmx_domdec_t *dd,
6842 const gmx_mtop_t *mtop,
6843 const gmx_vsite_t *vsite,
6844 const t_inputrec *ir,
6845 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6847 gmx_domdec_comm_t *comm;
6849 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6851 comm = dd->comm;
6853 if (comm->bBondComm)
6855 /* Communicate atoms beyond the cut-off for bonded interactions */
6856 comm = dd->comm;
6858 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6860 comm->bLocalCG = init_bLocalCG(mtop);
6862 else
6864 /* Only communicate atoms based on cut-off */
6865 comm->cglink = nullptr;
6866 comm->bLocalCG = nullptr;
6870 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6871 const gmx_mtop_t *mtop, const t_inputrec *ir,
6872 gmx_bool bDynLoadBal, real dlb_scale,
6873 const gmx_ddbox_t *ddbox)
6875 gmx_domdec_comm_t *comm;
6876 int d;
6877 ivec np;
6878 real limit, shrink;
6879 char buf[64];
6881 if (fplog == nullptr)
6883 return;
6886 comm = dd->comm;
6888 if (bDynLoadBal)
6890 fprintf(fplog, "The maximum number of communication pulses is:");
6891 for (d = 0; d < dd->ndim; d++)
6893 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6895 fprintf(fplog, "\n");
6896 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6897 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6898 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6899 for (d = 0; d < DIM; d++)
6901 if (dd->nc[d] > 1)
6903 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6905 shrink = 0;
6907 else
6909 shrink =
6910 comm->cellsize_min_dlb[d]/
6911 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6913 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6916 fprintf(fplog, "\n");
6918 else
6920 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6921 fprintf(fplog, "The initial number of communication pulses is:");
6922 for (d = 0; d < dd->ndim; d++)
6924 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6926 fprintf(fplog, "\n");
6927 fprintf(fplog, "The initial domain decomposition cell size is:");
6928 for (d = 0; d < DIM; d++)
6930 if (dd->nc[d] > 1)
6932 fprintf(fplog, " %c %.2f nm",
6933 dim2char(d), dd->comm->cellsize_min[d]);
6936 fprintf(fplog, "\n\n");
6939 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6941 if (comm->bInterCGBondeds ||
6942 bInterCGVsites ||
6943 dd->bInterCGcons || dd->bInterCGsettles)
6945 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6946 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6947 "non-bonded interactions", "", comm->cutoff);
6949 if (bDynLoadBal)
6951 limit = dd->comm->cellsize_limit;
6953 else
6955 if (dynamic_dd_box(ddbox, ir))
6957 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6959 limit = dd->comm->cellsize_min[XX];
6960 for (d = 1; d < DIM; d++)
6962 limit = std::min(limit, dd->comm->cellsize_min[d]);
6966 if (comm->bInterCGBondeds)
6968 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6969 "two-body bonded interactions", "(-rdd)",
6970 std::max(comm->cutoff, comm->cutoff_mbody));
6971 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6972 "multi-body bonded interactions", "(-rdd)",
6973 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6975 if (bInterCGVsites)
6977 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6978 "virtual site constructions", "(-rcon)", limit);
6980 if (dd->bInterCGcons || dd->bInterCGsettles)
6982 sprintf(buf, "atoms separated by up to %d constraints",
6983 1+ir->nProjOrder);
6984 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6985 buf, "(-rcon)", limit);
6987 fprintf(fplog, "\n");
6990 fflush(fplog);
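/*! \brief Sets the cell-size limits and the per-dimension number of communication pulses used when DLB is active */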
6993 static void set_cell_limits_dlb(gmx_domdec_t *dd,
6994 real dlb_scale,
6995 const t_inputrec *ir,
6996 const gmx_ddbox_t *ddbox)
6998 gmx_domdec_comm_t *comm;
6999 int d, dim, npulse, npulse_d_max, npulse_d;
7000 gmx_bool bNoCutOff;
7002 comm = dd->comm;
7004 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7006 /* Determine the maximum number of comm. pulses in one dimension */
7008 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7010 /* Determine the maximum required number of grid pulses */
7011 if (comm->cellsize_limit >= comm->cutoff)
7013 /* Only a single pulse is required */
7014 npulse = 1;
7016 else if (!bNoCutOff && comm->cellsize_limit > 0)
7018 /* We round down slightly here to avoid overhead due to the latency
7019 * of extra communication calls when the cut-off
7020 * would be only slightly longer than the cell size.
7021 * Later cellsize_limit is redetermined,
7022 * so we can not miss interactions due to this rounding.
7024 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7026 else
7028 /* There is no cell size limit */
7029 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7032 if (!bNoCutOff && npulse > 1)
7034 /* See if we can do with fewer pulses, based on dlb_scale */
7035 npulse_d_max = 0;
7036 for (d = 0; d < dd->ndim; d++)
7038 dim = dd->dim[d];
7039 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7040 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7041 npulse_d_max = std::max(npulse_d_max, npulse_d);
7043 npulse = std::min(npulse, npulse_d_max);
7046 /* This env var can override npulse */
7047 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7048 if (d > 0)
7050 npulse = d;
7053 comm->maxpulse = 1;
7054 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7055 for (d = 0; d < dd->ndim; d++)
7057 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7058 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7059 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7060 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7061 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7063 comm->bVacDLBNoLimit = FALSE;
7067 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7068 if (!comm->bVacDLBNoLimit)
7070 comm->cellsize_limit = std::max(comm->cellsize_limit,
7071 comm->cutoff/comm->maxpulse);
7073 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7074 /* Set the minimum cell size for each DD dimension */
7075 for (d = 0; d < dd->ndim; d++)
7077 if (comm->bVacDLBNoLimit ||
7078 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7080 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7082 else
7084 comm->cellsize_min_dlb[dd->dim[d]] =
7085 comm->cutoff/comm->cd[d].np_dlb;
7088 if (comm->cutoff_mbody <= 0)
7090 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7092 if (isDlbOn(comm))
7094 set_dlb_limits(dd);
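/* Returns whether PBC still needs to be taken into account for bonded interactions with this DD grid */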
7098 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7100 /* If each molecule is a single charge group
7101 * or we use domain decomposition for each periodic dimension,
7102 * we do not need to take pbc into account for the bonded interactions.
7104 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7105 !(dd->nc[XX] > 1 &&
7106 dd->nc[YY] > 1 &&
7107 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7110 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7111 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7112 const gmx_mtop_t *mtop, const t_inputrec *ir,
7113 const gmx_ddbox_t *ddbox)
7115 gmx_domdec_comm_t *comm;
7116 int natoms_tot;
7117 real vol_frac;
7119 comm = dd->comm;
7121 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7123 init_ddpme(dd, &comm->ddpme[0], 0);
7124 if (comm->npmedecompdim >= 2)
7126 init_ddpme(dd, &comm->ddpme[1], 1);
7129 else
7131 comm->npmenodes = 0;
7132 if (dd->pme_nodeid >= 0)
7134 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7135 "Can not have separate PME ranks without PME electrostatics");
7139 if (debug)
7141 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7143 if (!isDlbDisabled(comm))
7145 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7148 print_dd_settings(fplog, dd, mtop, ir, isDlbOn(comm), dlb_scale, ddbox);
7149 if (comm->dlbState == edlbsOffCanTurnOn)
7151 if (fplog)
7153 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7155 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7158 if (ir->ePBC == epbcNONE)
7160 vol_frac = 1 - 1/(double)dd->nnodes;
7162 else
7164 vol_frac =
7165 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7167 if (debug)
7169 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7171 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7173 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7176 /*! \brief Set some important DD parameters that can be modified by env.vars */
7177 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7179 gmx_domdec_comm_t *comm = dd->comm;
7181 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7182 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7183 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7184 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7185 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7186 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7187 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7189 if (dd->bSendRecv2 && fplog)
7191 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7194 if (comm->eFlop)
7196 if (fplog)
7198 fprintf(fplog, "Will load balance based on FLOP count\n");
7200 if (comm->eFlop > 1)
7202 srand(1 + rank_mysim);
7204 comm->bRecordLoad = TRUE;
7206 else
7208 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7212 DomdecOptions::DomdecOptions() :
7213 checkBondedInteractions(TRUE),
7214 useBondedCommunication(TRUE),
7215 numPmeRanks(-1),
7216 rankOrder(DdRankOrder::pp_pme),
7217 minimumCommunicationRange(0),
7218 constraintCommunicationRange(0),
7219 dlbOption(DlbOption::turnOnWhenUseful),
7220 dlbScaling(0.8),
7221 cellSizeX(nullptr),
7222 cellSizeY(nullptr),
7223 cellSizeZ(nullptr)
7225 clear_ivec(numCells);
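/* Creates the gmx_domdec_t struct: applies env. var overrides, sets up the DD grid, limits and communicators, and on PP ranks the grid parameters and neighbor relations */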
7228 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7229 const DomdecOptions &options,
7230 const MdrunOptions &mdrunOptions,
7231 const gmx_mtop_t *mtop,
7232 const t_inputrec *ir,
7233 const matrix box,
7234 const rvec *xGlobal)
7236 gmx_domdec_t *dd;
7238 if (fplog)
7240 fprintf(fplog,
7241 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7244 snew(dd, 1);
7246 dd->comm = init_dd_comm();
7248 set_dd_envvar_options(fplog, dd, cr->nodeid);
7250 gmx_ddbox_t ddbox = {0};
7251 set_dd_limits_and_grid(fplog, cr, dd, options, mdrunOptions,
7252 mtop, ir,
7253 box, xGlobal,
7254 &ddbox);
7256 make_dd_communicators(fplog, cr, dd, options.rankOrder);
7258 if (thisRankHasDuty(cr, DUTY_PP))
7260 set_ddgrid_parameters(fplog, dd, options.dlbScaling, mtop, ir, &ddbox);
7262 setup_neighbor_relations(dd);
7265 /* Set overallocation to avoid frequent reallocation of arrays */
7266 set_over_alloc_dd(TRUE);
7268 /* Initialize DD partitioning counters */
7269 dd->comm->partition_step = INT_MIN;
7270 dd->ddp_count = 0;
7272 /* We don't know the number of threads yet; we set this later */
7273 dd->comm->nth = 0;
7275 clear_dd_cycle_counts(dd);
7277 return dd;
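/*! \brief Checks on all ranks whether the DD grid and current cell sizes can accommodate the requested cut-off; returns FALSE if any rank is limited */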
7280 static gmx_bool test_dd_cutoff(t_commrec *cr,
7281 t_state *state, const t_inputrec *ir,
7282 real cutoff_req)
7284 gmx_domdec_t *dd;
7285 gmx_ddbox_t ddbox;
7286 int d, dim, np;
7287 real inv_cell_size;
7288 int LocallyLimited;
7290 dd = cr->dd;
7292 set_ddbox(dd, FALSE, ir, state->box,
7293 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7295 LocallyLimited = 0;
7297 for (d = 0; d < dd->ndim; d++)
7299 dim = dd->dim[d];
7301 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7302 if (dynamic_dd_box(&ddbox, ir))
7304 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7307 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7309 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
7311 if (np > dd->comm->cd[d].np_dlb)
7313 return FALSE;
7316 /* If a current local cell size is smaller than the requested
7317 * cut-off, we could still fix it, but this gets very complicated.
7318 * Without fixing here, we might actually need more checks.
7320 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7322 LocallyLimited = 1;
7327 if (!isDlbDisabled(dd->comm))
7329 /* If DLB is not active yet, we don't need to check the grid jumps.
7330 * Actually we shouldn't, because then the grid jump data is not set.
7332 if (isDlbOn(dd->comm) &&
7333 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7335 LocallyLimited = 1;
7338 gmx_sumi(1, &LocallyLimited, cr);
7340 if (LocallyLimited > 0)
7342 return FALSE;
7346 return TRUE;
7349 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7350 real cutoff_req)
7352 gmx_bool bCutoffAllowed;
7354 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7356 if (bCutoffAllowed)
7358 cr->dd->comm->cutoff = cutoff_req;
7361 return bCutoffAllowed;
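/* Enables the DLB staggering limit so that the given PME load-balancing cut-off keeps fitting */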
7364 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7366 gmx_domdec_comm_t *comm;
7368 comm = cr->dd->comm;
7370 /* Turn on the DLB limiting (might have been on already) */
7371 comm->bPMELoadBalDLBLimits = TRUE;
7373 /* Change the cut-off limit */
7374 comm->PMELoadBal_max_cutoff = cutoff;
7376 if (debug)
7378 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7379 comm->PMELoadBal_max_cutoff);
7383 /* Sets whether we should later check the load imbalance data, so that
7384 * we can trigger dynamic load balancing if enough imbalance has
7385 * arisen.
7387 * Used after PME load balancing unlocks DLB, so that the check
7388 * whether DLB will be useful can happen immediately.
7390 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7392 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7394 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7396 if (bValue == TRUE)
7398 /* Store the DD partitioning count, so we can ignore cycle counts
7399 * over the next nstlist steps, which are often slower.
7401 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7406 /* Returns whether we should check if there has been enough load
7407 * imbalance to trigger dynamic load balancing.
7409 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7411 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7413 return FALSE;
7416 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7418 /* We ignore the first nstlist steps at the start of the run
7419 * or after PME load balancing or after turning DLB off, since
7420 * these often have extra allocation or cache miss overhead.
7422 return FALSE;
7425 if (dd->comm->cycl_n[ddCyclStep] == 0)
7427 /* We can have zero timed steps when dd_partition_system is called
7428 * more than once at the same step, e.g. with replica exchange.
7429 * Turning on DLB would trigger an assertion failure later, but is
7430 * also useless right after exchanging replicas.
7432 return FALSE;
7435 /* We should check whether we should use DLB directly after
7436 * unlocking DLB. */
7437 if (dd->comm->bCheckWhetherToTurnDlbOn)
7439 /* This flag was set when the PME load-balancing routines
7440 unlocked DLB, and should now be cleared. */
7441 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7442 return TRUE;
7444 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7445 * partitionings (we do not do this every partitioning, so that we
7446 * avoid excessive communication). */
7447 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7449 return TRUE;
7452 return FALSE;
7455 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7457 return isDlbOn(dd->comm);
7460 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7462 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7465 void dd_dlb_lock(gmx_domdec_t *dd)
7467 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7468 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7470 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7474 void dd_dlb_unlock(gmx_domdec_t *dd)
7476 /* We can only unlock DLB when it was temporarily locked; otherwise don't do anything */
7477 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7479 dd->comm->dlbState = edlbsOffCanTurnOn;
7480 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
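/*! \brief Merges the charge groups received in this pulse into the zone arrays, shifting the entries and send indices stored for earlier pulses */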
7484 static void merge_cg_buffers(int ncell,
7485 gmx_domdec_comm_dim_t *cd, int pulse,
7486 int *ncg_cell,
7487 int *index_gl, int *recv_i,
7488 rvec *cg_cm, rvec *recv_vr,
7489 int *cgindex,
7490 cginfo_mb_t *cginfo_mb, int *cginfo)
7492 gmx_domdec_ind_t *ind, *ind_p;
7493 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7494 int shift, shift_at;
7496 ind = &cd->ind[pulse];
7498 /* First correct the already stored data */
7499 shift = ind->nrecv[ncell];
7500 for (cell = ncell-1; cell >= 0; cell--)
7502 shift -= ind->nrecv[cell];
7503 if (shift > 0)
7505 /* Move the cg's present from previous grid pulses */
7506 cg0 = ncg_cell[ncell+cell];
7507 cg1 = ncg_cell[ncell+cell+1];
7508 cgindex[cg1+shift] = cgindex[cg1];
7509 for (cg = cg1-1; cg >= cg0; cg--)
7511 index_gl[cg+shift] = index_gl[cg];
7512 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7513 cgindex[cg+shift] = cgindex[cg];
7514 cginfo[cg+shift] = cginfo[cg];
7516 /* Correct the already stored send indices for the shift */
7517 for (p = 1; p <= pulse; p++)
7519 ind_p = &cd->ind[p];
7520 cg0 = 0;
7521 for (c = 0; c < cell; c++)
7523 cg0 += ind_p->nsend[c];
7525 cg1 = cg0 + ind_p->nsend[cell];
7526 for (cg = cg0; cg < cg1; cg++)
7528 ind_p->index[cg] += shift;
7534 /* Merge in the communicated buffers */
7535 shift = 0;
7536 shift_at = 0;
7537 cg0 = 0;
7538 for (cell = 0; cell < ncell; cell++)
7540 cg1 = ncg_cell[ncell+cell+1] + shift;
7541 if (shift_at > 0)
7543 /* Correct the old cg indices */
7544 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7546 cgindex[cg+1] += shift_at;
7549 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7551 /* Copy this charge group from the buffer */
7552 index_gl[cg1] = recv_i[cg0];
7553 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7554 /* Add it to the cgindex */
7555 cg_gl = index_gl[cg1];
7556 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7557 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7558 cgindex[cg1+1] = cgindex[cg1] + nat;
7559 cg0++;
7560 cg1++;
7561 shift_at += nat;
7563 shift += ind->nrecv[cell];
7564 ncg_cell[ncell+cell+1] = cg1;
7568 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7569 int nzone, int cg0, const int *cgindex)
7571 int cg, zone, p;
7573 /* Store the atom block boundaries for easy copying of communication buffers
7575 cg = cg0;
7576 for (zone = 0; zone < nzone; zone++)
7578 for (p = 0; p < cd->np; p++)
7580 cd->ind[p].cell2at0[zone] = cgindex[cg];
7581 cg += cd->ind[p].nrecv[zone];
7582 cd->ind[p].cell2at1[zone] = cgindex[cg];
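/*! \brief Returns TRUE when charge group cg_gl is linked to at least one charge group that is not marked as local in bLocalCG */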
7587 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7589 int i;
7590 gmx_bool bMiss;
7592 bMiss = FALSE;
7593 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7595 if (!bLocalCG[link->a[i]])
7597 bMiss = TRUE;
7601 return bMiss;
7604 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7605 typedef struct {
7606 real c[DIM][4]; /* the corners for the non-bonded communication */
7607 real cr0; /* corner for rounding */
7608 real cr1[4]; /* corners for rounding */
7609 real bc[DIM]; /* corners for bounded communication */
7610 real bcr1; /* corner for rounding for bonded communication */
7611 } dd_corners_t;
7613 /* Determine the corners of the domain(s) we are communicating with */
7614 static void
7615 set_dd_corners(const gmx_domdec_t *dd,
7616 int dim0, int dim1, int dim2,
7617 gmx_bool bDistMB,
7618 dd_corners_t *c)
7620 const gmx_domdec_comm_t *comm;
7621 const gmx_domdec_zones_t *zones;
7622 int i, j;
7624 comm = dd->comm;
7626 zones = &comm->zones;
7628 /* Keep the compiler happy */
7629 c->cr0 = 0;
7630 c->bcr1 = 0;
7632 /* The first dimension is equal for all cells */
7633 c->c[0][0] = comm->cell_x0[dim0];
7634 if (bDistMB)
7636 c->bc[0] = c->c[0][0];
7638 if (dd->ndim >= 2)
7640 dim1 = dd->dim[1];
7641 /* This cell row is only seen from the first row */
7642 c->c[1][0] = comm->cell_x0[dim1];
7643 /* All rows can see this row */
7644 c->c[1][1] = comm->cell_x0[dim1];
7645 if (isDlbOn(dd->comm))
7647 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7648 if (bDistMB)
7650 /* For the multi-body distance we need the maximum */
7651 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7654 /* Set the upper-right corner for rounding */
7655 c->cr0 = comm->cell_x1[dim0];
7657 if (dd->ndim >= 3)
7659 dim2 = dd->dim[2];
7660 for (j = 0; j < 4; j++)
7662 c->c[2][j] = comm->cell_x0[dim2];
7664 if (isDlbOn(dd->comm))
7666 /* Use the maximum of the i-cells that see a j-cell */
7667 for (i = 0; i < zones->nizone; i++)
7669 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7671 if (j >= 4)
7673 c->c[2][j-4] =
7674 std::max(c->c[2][j-4],
7675 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7679 if (bDistMB)
7681 /* For the multi-body distance we need the maximum */
7682 c->bc[2] = comm->cell_x0[dim2];
7683 for (i = 0; i < 2; i++)
7685 for (j = 0; j < 2; j++)
7687 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7693 /* Set the upper-right corner for rounding */
7694 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7695 * Only cell (0,0,0) can see cell 7 (1,1,1)
7697 c->cr1[0] = comm->cell_x1[dim1];
7698 c->cr1[3] = comm->cell_x1[dim1];
7699 if (isDlbOn(dd->comm))
7701 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7702 if (bDistMB)
7704 /* For the multi-body distance we need the maximum */
7705 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7712 /* Determine which cg's we need to send in this pulse from this zone */
7713 static void
7714 get_zone_pulse_cgs(gmx_domdec_t *dd,
7715 int zonei, int zone,
7716 int cg0, int cg1,
7717 const int *index_gl,
7718 const int *cgindex,
7719 int dim, int dim_ind,
7720 int dim0, int dim1, int dim2,
7721 real r_comm2, real r_bcomm2,
7722 matrix box,
7723 ivec tric_dist,
7724 rvec *normal,
7725 real skew_fac2_d, real skew_fac_01,
7726 rvec *v_d, rvec *v_0, rvec *v_1,
7727 const dd_corners_t *c,
7728 rvec sf2_round,
7729 gmx_bool bDistBonded,
7730 gmx_bool bBondComm,
7731 gmx_bool bDist2B,
7732 gmx_bool bDistMB,
7733 rvec *cg_cm,
7734 int *cginfo,
7735 gmx_domdec_ind_t *ind,
7736 int **ibuf, int *ibuf_nalloc,
7737 vec_rvec_t *vbuf,
7738 int *nsend_ptr,
7739 int *nat_ptr,
7740 int *nsend_z_ptr)
7742 gmx_domdec_comm_t *comm;
7743 gmx_bool bScrew;
7744 gmx_bool bDistMB_pulse;
7745 int cg, i;
7746 real r2, rb2, r, tric_sh;
7747 rvec rn, rb;
7748 int dimd;
7749 int nsend_z, nsend, nat;
7751 comm = dd->comm;
7753 bScrew = (dd->bScrewPBC && dim == XX);
7755 bDistMB_pulse = (bDistMB && bDistBonded);
7757 nsend_z = 0;
7758 nsend = *nsend_ptr;
7759 nat = *nat_ptr;
7761 for (cg = cg0; cg < cg1; cg++)
7763 r2 = 0;
7764 rb2 = 0;
7765 if (tric_dist[dim_ind] == 0)
7767 /* Rectangular direction, easy */
7768 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7769 if (r > 0)
7771 r2 += r*r;
7773 if (bDistMB_pulse)
7775 r = cg_cm[cg][dim] - c->bc[dim_ind];
7776 if (r > 0)
7778 rb2 += r*r;
7781 /* Rounding gives at most a 16% reduction
7782 * in communicated atoms
7784 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7786 r = cg_cm[cg][dim0] - c->cr0;
7787 /* This is the first dimension, so always r >= 0 */
7788 r2 += r*r;
7789 if (bDistMB_pulse)
7791 rb2 += r*r;
7794 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7796 r = cg_cm[cg][dim1] - c->cr1[zone];
7797 if (r > 0)
7799 r2 += r*r;
7801 if (bDistMB_pulse)
7803 r = cg_cm[cg][dim1] - c->bcr1;
7804 if (r > 0)
7806 rb2 += r*r;
7811 else
7813 /* Triclinic direction, more complicated */
7814 clear_rvec(rn);
7815 clear_rvec(rb);
7816 /* Rounding, conservative as the skew_fac multiplication
7817 * will slightly underestimate the distance.
7819 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7821 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7822 for (i = dim0+1; i < DIM; i++)
7824 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7826 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7827 if (bDistMB_pulse)
7829 rb[dim0] = rn[dim0];
7830 rb2 = r2;
7832 /* Take care that the cell planes along dim0 might not
7833 * be orthogonal to those along dim1 and dim2.
7835 for (i = 1; i <= dim_ind; i++)
7837 dimd = dd->dim[i];
7838 if (normal[dim0][dimd] > 0)
7840 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7841 if (bDistMB_pulse)
7843 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7848 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7850 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7851 tric_sh = 0;
7852 for (i = dim1+1; i < DIM; i++)
7854 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7856 rn[dim1] += tric_sh;
7857 if (rn[dim1] > 0)
7859 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7860 /* Take care of coupling of the distances
7861 * to the planes along dim0 and dim1 through dim2.
7863 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7864 /* Take care that the cell planes along dim1
7865 * might not be orthogonal to that along dim2.
7867 if (normal[dim1][dim2] > 0)
7869 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7872 if (bDistMB_pulse)
7874 rb[dim1] +=
7875 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7876 if (rb[dim1] > 0)
7878 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7879 /* Take care of coupling of the distances
7880 * to the planes along dim0 and dim1 through dim2.
7882 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7883 /* Take care that the cell planes along dim1
7884 * might not be orthogonal to that along dim2.
7886 if (normal[dim1][dim2] > 0)
7888 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7893 /* The distance along the communication direction */
7894 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7895 tric_sh = 0;
7896 for (i = dim+1; i < DIM; i++)
7898 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7900 rn[dim] += tric_sh;
7901 if (rn[dim] > 0)
7903 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7904 /* Take care of coupling of the distances
7905 * to the planes along dim0 and dim1 through dim2.
7907 if (dim_ind == 1 && zonei == 1)
7909 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7912 if (bDistMB_pulse)
7914 clear_rvec(rb);
7915 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7916 if (rb[dim] > 0)
7918 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7919 /* Take care of coupling of the distances
7920 * to the planes along dim0 and dim1 through dim2.
7922 if (dim_ind == 1 && zonei == 1)
7924 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7930 if (r2 < r_comm2 ||
7931 (bDistBonded &&
7932 ((bDistMB && rb2 < r_bcomm2) ||
7933 (bDist2B && r2 < r_bcomm2)) &&
7934 (!bBondComm ||
7935 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7936 missing_link(comm->cglink, index_gl[cg],
7937 comm->bLocalCG)))))
7939 /* Make an index to the local charge groups */
7940 if (nsend+1 > ind->nalloc)
7942 ind->nalloc = over_alloc_large(nsend+1);
7943 srenew(ind->index, ind->nalloc);
7945 if (nsend+1 > *ibuf_nalloc)
7947 *ibuf_nalloc = over_alloc_large(nsend+1);
7948 srenew(*ibuf, *ibuf_nalloc);
7950 ind->index[nsend] = cg;
7951 (*ibuf)[nsend] = index_gl[cg];
7952 nsend_z++;
7953 vec_rvec_check_alloc(vbuf, nsend+1);
7955 if (dd->ci[dim] == 0)
7957 /* Correct cg_cm for pbc */
7958 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7959 if (bScrew)
7961 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7962 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7965 else
7967 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7969 nsend++;
7970 nat += cgindex[cg+1] - cgindex[cg];
7974 *nsend_ptr = nsend;
7975 *nat_ptr = nat;
7976 *nsend_z_ptr = nsend_z;
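/*! \brief Sets up the halo communication: for each DD dimension and pulse, selects the charge groups to send and communicates the counts, global indices and coordinates */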
7979 static void setup_dd_communication(gmx_domdec_t *dd,
7980 matrix box, gmx_ddbox_t *ddbox,
7981 t_forcerec *fr,
7982 t_state *state, PaddedRVecVector *f)
7984 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7985 int nzone, nzone_send, zone, zonei, cg0, cg1;
7986 int c, i, cg, cg_gl, nrcg;
7987 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7988 gmx_domdec_comm_t *comm;
7989 gmx_domdec_zones_t *zones;
7990 gmx_domdec_comm_dim_t *cd;
7991 gmx_domdec_ind_t *ind;
7992 cginfo_mb_t *cginfo_mb;
7993 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7994 real r_comm2, r_bcomm2;
7995 dd_corners_t corners;
7996 ivec tric_dist;
7997 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
7998 real skew_fac2_d, skew_fac_01;
7999 rvec sf2_round;
8000 int nsend, nat;
8001 int th;
8003 if (debug)
8005 fprintf(debug, "Setting up DD communication\n");
8008 comm = dd->comm;
8010 if (comm->nth == 0)
8012 /* Initialize the thread data.
8013 * This can not be done in init_domain_decomposition,
8014 * as the number of threads is determined later.
8016 comm->nth = gmx_omp_nthreads_get(emntDomdec);
8017 if (comm->nth > 1)
8019 snew(comm->dth, comm->nth);
8023 switch (fr->cutoff_scheme)
8025 case ecutsGROUP:
8026 cg_cm = fr->cg_cm;
8027 break;
8028 case ecutsVERLET:
8029 cg_cm = as_rvec_array(state->x.data());
8030 break;
8031 default:
8032 gmx_incons("unimplemented");
8033 cg_cm = nullptr;
8036 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8038 /* Check if we need to use triclinic distances */
8039 tric_dist[dim_ind] = 0;
8040 for (i = 0; i <= dim_ind; i++)
8042 if (ddbox->tric_dir[dd->dim[i]])
8044 tric_dist[dim_ind] = 1;
8049 bBondComm = comm->bBondComm;
8051 /* Do we need to determine extra distances for multi-body bondeds? */
8052 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8054 /* Do we need to determine extra distances for only two-body bondeds? */
8055 bDist2B = (bBondComm && !bDistMB);
8057 r_comm2 = gmx::square(comm->cutoff);
8058 r_bcomm2 = gmx::square(comm->cutoff_mbody);
8060 if (debug)
8062 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8065 zones = &comm->zones;
8067 dim0 = dd->dim[0];
8068 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8069 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8071 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8073 /* Triclinic stuff */
8074 normal = ddbox->normal;
8075 skew_fac_01 = 0;
8076 if (dd->ndim >= 2)
8078 v_0 = ddbox->v[dim0];
8079 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8081 /* Determine the coupling coefficient for the distances
8082 * to the cell planes along dim0 and dim1 through dim2.
8083 * This is required for correct rounding.
8085 skew_fac_01 =
8086 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8087 if (debug)
8089 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8093 if (dd->ndim >= 3)
8095 v_1 = ddbox->v[dim1];
8098 zone_cg_range = zones->cg_range;
8099 index_gl = dd->index_gl;
8100 cgindex = dd->cgindex;
8101 cginfo_mb = fr->cginfo_mb;
8103 zone_cg_range[0] = 0;
8104 zone_cg_range[1] = dd->ncg_home;
8105 comm->zone_ncg1[0] = dd->ncg_home;
8106 pos_cg = dd->ncg_home;
8108 nat_tot = dd->nat_home;
8109 nzone = 1;
8110 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8112 dim = dd->dim[dim_ind];
8113 cd = &comm->cd[dim_ind];
8115 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8117 /* No pbc in this dimension, the first node should not comm. */
8118 nzone_send = 0;
8120 else
8122 nzone_send = nzone;
8125 v_d = ddbox->v[dim];
8126 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8128 cd->bInPlace = TRUE;
8129 for (p = 0; p < cd->np; p++)
8131 /* Only atoms communicated in the first pulse are used
8132 * for multi-body bonded interactions or for bBondComm.
8134 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8136 ind = &cd->ind[p];
8137 nsend = 0;
8138 nat = 0;
8139 for (zone = 0; zone < nzone_send; zone++)
8141 if (tric_dist[dim_ind] && dim_ind > 0)
8143 /* Determine slightly more optimized skew_fac's
8144 * for rounding.
8145 * This reduces the number of communicated atoms
8146 * by about 10% for 3D DD of rhombic dodecahedra.
8148 for (dimd = 0; dimd < dim; dimd++)
8150 sf2_round[dimd] = 1;
8151 if (ddbox->tric_dir[dimd])
8153 for (i = dd->dim[dimd]+1; i < DIM; i++)
8155 /* If we are shifted in dimension i
8156 * and the cell plane is tilted forward
8157 * in dimension i, skip this coupling.
8159 if (!(zones->shift[nzone+zone][i] &&
8160 ddbox->v[dimd][i][dimd] >= 0))
8162 sf2_round[dimd] +=
8163 gmx::square(ddbox->v[dimd][i][dimd]);
8166 sf2_round[dimd] = 1/sf2_round[dimd];
8171 zonei = zone_perm[dim_ind][zone];
8172 if (p == 0)
8174 /* Here we permute the zones to obtain a convenient order
8175 * for neighbor searching
8177 cg0 = zone_cg_range[zonei];
8178 cg1 = zone_cg_range[zonei+1];
8180 else
8182 /* Look only at the cg's received in the previous grid pulse
8184 cg1 = zone_cg_range[nzone+zone+1];
8185 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8188 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8189 for (th = 0; th < comm->nth; th++)
8193 gmx_domdec_ind_t *ind_p;
8194 int **ibuf_p, *ibuf_nalloc_p;
8195 vec_rvec_t *vbuf_p;
8196 int *nsend_p, *nat_p;
8197 int *nsend_zone_p;
8198 int cg0_th, cg1_th;
8200 if (th == 0)
8202 /* Thread 0 writes in the comm buffers */
8203 ind_p = ind;
8204 ibuf_p = &comm->buf_int;
8205 ibuf_nalloc_p = &comm->nalloc_int;
8206 vbuf_p = &comm->vbuf;
8207 nsend_p = &nsend;
8208 nat_p = &nat;
8209 nsend_zone_p = &ind->nsend[zone];
8211 else
8213 /* Other threads write into temp buffers */
8214 ind_p = &comm->dth[th].ind;
8215 ibuf_p = &comm->dth[th].ibuf;
8216 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8217 vbuf_p = &comm->dth[th].vbuf;
8218 nsend_p = &comm->dth[th].nsend;
8219 nat_p = &comm->dth[th].nat;
8220 nsend_zone_p = &comm->dth[th].nsend_zone;
8222 comm->dth[th].nsend = 0;
8223 comm->dth[th].nat = 0;
8224 comm->dth[th].nsend_zone = 0;
8227 if (comm->nth == 1)
8229 cg0_th = cg0;
8230 cg1_th = cg1;
8232 else
8234 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8235 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8238 /* Get the cg's for this pulse in this zone */
8239 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8240 index_gl, cgindex,
8241 dim, dim_ind, dim0, dim1, dim2,
8242 r_comm2, r_bcomm2,
8243 box, tric_dist,
8244 normal, skew_fac2_d, skew_fac_01,
8245 v_d, v_0, v_1, &corners, sf2_round,
8246 bDistBonded, bBondComm,
8247 bDist2B, bDistMB,
8248 cg_cm, fr->cginfo,
8249 ind_p,
8250 ibuf_p, ibuf_nalloc_p,
8251 vbuf_p,
8252 nsend_p, nat_p,
8253 nsend_zone_p);
8255 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8256 } // END
8258 /* Append data of threads>=1 to the communication buffers */
8259 for (th = 1; th < comm->nth; th++)
8261 dd_comm_setup_work_t *dth;
8262 int i, ns1;
8264 dth = &comm->dth[th];
8266 ns1 = nsend + dth->nsend_zone;
8267 if (ns1 > ind->nalloc)
8269 ind->nalloc = over_alloc_dd(ns1);
8270 srenew(ind->index, ind->nalloc);
8272 if (ns1 > comm->nalloc_int)
8274 comm->nalloc_int = over_alloc_dd(ns1);
8275 srenew(comm->buf_int, comm->nalloc_int);
8277 if (ns1 > comm->vbuf.nalloc)
8279 comm->vbuf.nalloc = over_alloc_dd(ns1);
8280 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8283 for (i = 0; i < dth->nsend_zone; i++)
8285 ind->index[nsend] = dth->ind.index[i];
8286 comm->buf_int[nsend] = dth->ibuf[i];
8287 copy_rvec(dth->vbuf.v[i],
8288 comm->vbuf.v[nsend]);
8289 nsend++;
8291 nat += dth->nat;
8292 ind->nsend[zone] += dth->nsend_zone;
8295 /* Clear the counts in case we do not have pbc */
8296 for (zone = nzone_send; zone < nzone; zone++)
8298 ind->nsend[zone] = 0;
8300 ind->nsend[nzone] = nsend;
8301 ind->nsend[nzone+1] = nat;
8302 /* Communicate the number of cg's and atoms to receive */
8303 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8304 ind->nsend, nzone+2,
8305 ind->nrecv, nzone+2);
8307 /* The rvec buffer is also required for atom buffers of size nsend
8308 * in dd_move_x and dd_move_f.
8310 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8312 if (p > 0)
8314 /* We can receive in place if only the last zone is not empty */
8315 for (zone = 0; zone < nzone-1; zone++)
8317 if (ind->nrecv[zone] > 0)
8319 cd->bInPlace = FALSE;
8322 if (!cd->bInPlace)
8324 /* The int buffer is only required here for the cg indices */
8325 if (ind->nrecv[nzone] > comm->nalloc_int2)
8327 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8328 srenew(comm->buf_int2, comm->nalloc_int2);
8330 /* The rvec buffer is also required for atom buffers
8331 * of size nrecv in dd_move_x and dd_move_f.
8333 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8334 vec_rvec_check_alloc(&comm->vbuf2, i);
8338 /* Make space for the global cg indices */
8339 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8340 || dd->cg_nalloc == 0)
8342 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8343 srenew(index_gl, dd->cg_nalloc);
8344 srenew(cgindex, dd->cg_nalloc+1);
8346 /* Communicate the global cg indices */
8347 if (cd->bInPlace)
8349 recv_i = index_gl + pos_cg;
8351 else
8353 recv_i = comm->buf_int2;
8355 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8356 comm->buf_int, nsend,
8357 recv_i, ind->nrecv[nzone]);
8359 /* Make space for cg_cm */
8360 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8361 if (fr->cutoff_scheme == ecutsGROUP)
8363 cg_cm = fr->cg_cm;
8365 else
8367 cg_cm = as_rvec_array(state->x.data());
8369 /* Communicate cg_cm */
8370 if (cd->bInPlace)
8372 recv_vr = cg_cm + pos_cg;
8374 else
8376 recv_vr = comm->vbuf2.v;
8378 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8379 comm->vbuf.v, nsend,
8380 recv_vr, ind->nrecv[nzone]);
8382 /* Make the charge group index */
8383 if (cd->bInPlace)
8385 zone = (p == 0 ? 0 : nzone - 1);
8386 while (zone < nzone)
8388 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8390 cg_gl = index_gl[pos_cg];
8391 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8392 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8393 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8394 if (bBondComm)
8396 /* Update the charge group presence,
8397 * so we can use it in the next pass of the loop.
8399 comm->bLocalCG[cg_gl] = TRUE;
8401 pos_cg++;
8403 if (p == 0)
8405 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8407 zone++;
8408 zone_cg_range[nzone+zone] = pos_cg;
8411 else
8413 /* This part of the code is never executed with bBondComm. */
8414 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8415 index_gl, recv_i, cg_cm, recv_vr,
8416 cgindex, fr->cginfo_mb, fr->cginfo);
8417 pos_cg += ind->nrecv[nzone];
8419 nat_tot += ind->nrecv[nzone+1];
8421 if (!cd->bInPlace)
8423 /* Store the atom block for easy copying of communication buffers */
8424 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8426 nzone += nzone;
8428 dd->index_gl = index_gl;
8429 dd->cgindex = cgindex;
8431 dd->ncg_tot = zone_cg_range[zones->n];
8432 dd->nat_tot = nat_tot;
8433 comm->nat[ddnatHOME] = dd->nat_home;
8434 for (i = ddnatZONE; i < ddnatNR; i++)
8436 comm->nat[i] = dd->nat_tot;
8439 if (!bBondComm)
8441 /* We don't need to update cginfo, since that was already done above.
8442 * So we pass NULL for the forcerec.
8444 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8445 nullptr, comm->bLocalCG);
8448 if (debug)
8450 fprintf(debug, "Finished setting up DD communication, zones:");
8451 for (c = 0; c < zones->n; c++)
8453 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8455 fprintf(debug, "\n");
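/*! \brief Sets the charge-group boundaries of the i-zones and their j-ranges from the zone cg ranges */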
8459 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8461 int c;
8463 for (c = 0; c < zones->nizone; c++)
8465 zones->izone[c].cg1 = zones->cg_range[c+1];
8466 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8467 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8471 /* \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
8473 * Also sets the atom density for the home zone when \p zone_start=0.
8474 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
8475 * how many charge groups will move but are still part of the current range.
8476 * \todo When converting domdec to use proper classes, all these variables
8477 * should be private and a method should return the correct count
8478 * depending on an internal state.
8480 * \param[in,out] dd The domain decomposition struct
8481 * \param[in] box The box
8482 * \param[in] ddbox The domain decomposition box struct
8483 * \param[in] zone_start The start of the zone range to set sizes for
8484 * \param[in] zone_end The end of the zone range to set sizes for
8485 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should be moved but are still present in dd->comm->zones.cg_range
8487 static void set_zones_size(gmx_domdec_t *dd,
8488 matrix box, const gmx_ddbox_t *ddbox,
8489 int zone_start, int zone_end,
8490 int numMovedChargeGroupsInHomeZone)
8492 gmx_domdec_comm_t *comm;
8493 gmx_domdec_zones_t *zones;
8494 gmx_bool bDistMB;
8495 int z, zi, d, dim;
8496 real rcs, rcmbs;
8497 int i, j;
8498 real vol;
8500 comm = dd->comm;
8502 zones = &comm->zones;
8504 /* Do we need to determine extra distances for multi-body bondeds? */
8505 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8507 for (z = zone_start; z < zone_end; z++)
8509 /* Copy cell limits to zone limits.
8510 * Valid for non-DD dims and non-shifted dims.
8512 copy_rvec(comm->cell_x0, zones->size[z].x0);
8513 copy_rvec(comm->cell_x1, zones->size[z].x1);
8516 for (d = 0; d < dd->ndim; d++)
8518 dim = dd->dim[d];
8520 for (z = 0; z < zones->n; z++)
8522 /* With a staggered grid we have different sizes
8523 * for non-shifted dimensions.
8525 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
8527 if (d == 1)
8529 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8530 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8532 else if (d == 2)
8534 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8535 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8540 rcs = comm->cutoff;
8541 rcmbs = comm->cutoff_mbody;
8542 if (ddbox->tric_dir[dim])
8544 rcs /= ddbox->skew_fac[dim];
8545 rcmbs /= ddbox->skew_fac[dim];
8548 /* Set the lower limit for the shifted zone dimensions */
8549 for (z = zone_start; z < zone_end; z++)
8551 if (zones->shift[z][dim] > 0)
8553 dim = dd->dim[d];
8554 if (!isDlbOn(dd->comm) || d == 0)
8556 zones->size[z].x0[dim] = comm->cell_x1[dim];
8557 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8559 else
8561 /* Here we take the lower limit of the zone from
8562 * the lowest domain of the zone below.
8564 if (z < 4)
8566 zones->size[z].x0[dim] =
8567 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8569 else
8571 if (d == 1)
8573 zones->size[z].x0[dim] =
8574 zones->size[zone_perm[2][z-4]].x0[dim];
8576 else
8578 zones->size[z].x0[dim] =
8579 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8582 /* A temporary limit, is updated below */
8583 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8585 if (bDistMB)
8587 for (zi = 0; zi < zones->nizone; zi++)
8589 if (zones->shift[zi][dim] == 0)
8591 /* This takes the whole zone into account.
8592 * With multiple pulses this will lead
8593 * to a larger zone than strictly necessary.
8595 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8596 zones->size[zi].x1[dim]+rcmbs);
8604 /* Loop over the i-zones to set the upper limit of each
8605 * j-zone they see.
8607 for (zi = 0; zi < zones->nizone; zi++)
8609 if (zones->shift[zi][dim] == 0)
8611 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8613 if (zones->shift[z][dim] > 0)
8615 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8616 zones->size[zi].x1[dim]+rcs);
8623 for (z = zone_start; z < zone_end; z++)
8625 /* Initialization only required to keep the compiler happy */
8626 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8627 int nc, c;
8629 /* To determine the bounding box for a zone we need to find
8630 * the extremes of 4, 2 or 1 corners.
8632 nc = 1 << (ddbox->nboundeddim - 1);
8634 for (c = 0; c < nc; c++)
8636 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8637 corner[XX] = 0;
8638 if ((c & 1) == 0)
8640 corner[YY] = zones->size[z].x0[YY];
8642 else
8644 corner[YY] = zones->size[z].x1[YY];
8646 if ((c & 2) == 0)
8648 corner[ZZ] = zones->size[z].x0[ZZ];
8650 else
8652 corner[ZZ] = zones->size[z].x1[ZZ];
8654 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8655 box[ZZ][1 - dd->dim[0]] != 0)
8657 /* With 1D domain decomposition the cg's are not in
8658 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8659 * Shift the corner of the z-vector back to along the box
8660 * vector of dimension d, so it will later end up at 0 along d.
8661 * This can affect the location of this corner along dd->dim[0]
8662 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8664 int d = 1 - dd->dim[0];
8666 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8668 /* Apply the triclinic couplings */
8669 assert(ddbox->npbcdim <= DIM);
8670 for (i = YY; i < ddbox->npbcdim; i++)
8672 for (j = XX; j < i; j++)
8674 corner[j] += corner[i]*box[i][j]/box[i][i];
8677 if (c == 0)
8679 copy_rvec(corner, corner_min);
8680 copy_rvec(corner, corner_max);
8682 else
8684 for (i = 0; i < DIM; i++)
8686 corner_min[i] = std::min(corner_min[i], corner[i]);
8687 corner_max[i] = std::max(corner_max[i], corner[i]);
8691 /* Copy the extreme corners without offset along x */
8692 for (i = 0; i < DIM; i++)
8694 zones->size[z].bb_x0[i] = corner_min[i];
8695 zones->size[z].bb_x1[i] = corner_max[i];
8697 /* Add the offset along x */
8698 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8699 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8702 if (zone_start == 0)
8704 vol = 1;
8705 for (dim = 0; dim < DIM; dim++)
8707 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8709 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
8712 if (debug)
8714 for (z = zone_start; z < zone_end; z++)
8716 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8718 zones->size[z].x0[XX], zones->size[z].x1[XX],
8719 zones->size[z].x0[YY], zones->size[z].x1[YY],
8720 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8721 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8723 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8724 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8725 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
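/*! \brief qsort comparator for charge groups: orders by ns grid cell index, then by global topology index */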
8730 static int comp_cgsort(const void *a, const void *b)
8732 int comp;
8734 gmx_cgsort_t *cga, *cgb;
8735 cga = (gmx_cgsort_t *)a;
8736 cgb = (gmx_cgsort_t *)b;
8738 comp = cga->nsc - cgb->nsc;
8739 if (comp == 0)
8741 comp = cga->ind_gl - cgb->ind_gl;
8744 return comp;
8747 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8748 int *a, int *buf)
8750 int i;
8752 /* Order the data */
8753 for (i = 0; i < n; i++)
8755 buf[i] = a[sort[i].ind];
8758 /* Copy back to the original array */
8759 for (i = 0; i < n; i++)
8761 a[i] = buf[i];
8765 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8766 rvec *v, rvec *buf)
8768 int i;
8770 /* Order the data */
8771 for (i = 0; i < n; i++)
8773 copy_rvec(v[sort[i].ind], buf[i]);
8776 /* Copy back to the original array */
8777 for (i = 0; i < n; i++)
8779 copy_rvec(buf[i], v[i]);
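/*! \brief Reorders the rvecs of whole charge groups atom by atom according to sort; with cgindex==NULL falls back to per-cg ordering */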
8783 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8784 rvec *v, rvec *buf)
8786 int a, atot, cg, cg0, cg1, i;
8788 if (cgindex == nullptr)
8790 /* Avoid the useless loop of the atoms within a cg */
8791 order_vec_cg(ncg, sort, v, buf);
8793 return;
8796 /* Order the data */
8797 a = 0;
8798 for (cg = 0; cg < ncg; cg++)
8800 cg0 = cgindex[sort[cg].ind];
8801 cg1 = cgindex[sort[cg].ind+1];
8802 for (i = cg0; i < cg1; i++)
8804 copy_rvec(v[i], buf[a]);
8805 a++;
8808 atot = a;
8810 /* Copy back to the original array */
8811 for (a = 0; a < atot; a++)
8813 copy_rvec(buf[a], v[a]);
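/*! \brief qsorts the new entries and merges them with the already ordered sort2 list into sort1 */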
8817 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8818 int nsort_new, gmx_cgsort_t *sort_new,
8819 gmx_cgsort_t *sort1)
8821 int i1, i2, i_new;
8823 /* The new indices are not very ordered, so we qsort them */
8824 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8826 /* sort2 is already ordered, so now we can merge the two arrays */
8827 i1 = 0;
8828 i2 = 0;
8829 i_new = 0;
8830 while (i2 < nsort2 || i_new < nsort_new)
8832 if (i2 == nsort2)
8834 sort1[i1++] = sort_new[i_new++];
8836 else if (i_new == nsort_new)
8838 sort1[i1++] = sort2[i2++];
8840 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8841 (sort2[i2].nsc == sort_new[i_new].nsc &&
8842 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8844 sort1[i1++] = sort2[i2++];
8846 else
8848 sort1[i1++] = sort_new[i_new++];
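/*! \brief Determines the home charge-group sort order for the group cut-off scheme; returns the number of charge groups that stay home */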
8853 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8855 gmx_domdec_sort_t *sort;
8856 gmx_cgsort_t *cgsort, *sort_i;
8857 int ncg_new, nsort2, nsort_new, i, *a, moved;
8859 sort = dd->comm->sort;
8861 a = fr->ns->grid->cell_index;
8863 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8865 if (ncg_home_old >= 0)
8867 /* The charge groups that remained in the same ns grid cell
8868 * are completely ordered. So we can sort efficiently by sorting only
8869 * the charge groups that moved and merging them into the stationary list.
8871 ncg_new = 0;
8872 nsort2 = 0;
8873 nsort_new = 0;
8874 for (i = 0; i < dd->ncg_home; i++)
8876 /* Check if this cg did not move to another node */
8877 if (a[i] < moved)
8879 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8881 /* This cg is new on this node or moved to another ns grid cell */
8882 if (nsort_new >= sort->sort_new_nalloc)
8884 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8885 srenew(sort->sort_new, sort->sort_new_nalloc);
8887 sort_i = &(sort->sort_new[nsort_new++]);
8889 else
8891 /* This cg did not move */
8892 sort_i = &(sort->sort2[nsort2++]);
8894 /* Sort on the ns grid cell indices
8895 * and the global topology index.
8896 * index_gl is irrelevant with cell ns,
8897 * but we set it here anyhow to avoid a conditional.
8899 sort_i->nsc = a[i];
8900 sort_i->ind_gl = dd->index_gl[i];
8901 sort_i->ind = i;
8902 ncg_new++;
8905 if (debug)
8907 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8908 nsort2, nsort_new);
8910 /* Sort efficiently */
8911 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8912 sort->sort);
8914 else
8916 cgsort = sort->sort;
8917 ncg_new = 0;
8918 for (i = 0; i < dd->ncg_home; i++)
8920 /* Sort on the ns grid cell indices
8921 * and the global topology index
8923 cgsort[i].nsc = a[i];
8924 cgsort[i].ind_gl = dd->index_gl[i];
8925 cgsort[i].ind = i;
8926 if (cgsort[i].nsc < moved)
8928 ncg_new++;
8931 if (debug)
8933 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8935 /* Determine the order of the charge groups using qsort */
8936 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8939 return ncg_new;
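/*! \brief Determines the home charge-group order from the nbnxn grid atom order; returns the new home count */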
8942 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8944 gmx_cgsort_t *sort;
8945 int ncg_new, i, na;
8946 const int *a;
8948 sort = dd->comm->sort->sort;
8950 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8952 ncg_new = 0;
8953 for (i = 0; i < na; i++)
8955 if (a[i] >= 0)
8957 sort[ncg_new].ind = a[i];
8958 ncg_new++;
8962 return ncg_new;
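/*! \brief Sorts the home charge groups and reorders the local state, cg_cm, global indices and cginfo accordingly */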
8965 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8966 int ncg_home_old)
8968 gmx_domdec_sort_t *sort;
8969 gmx_cgsort_t *cgsort;
8970 int *cgindex;
8971 int ncg_new, i, *ibuf, cgsize;
8972 rvec *vbuf;
8974 sort = dd->comm->sort;
8976 if (dd->ncg_home > sort->sort_nalloc)
8978 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8979 srenew(sort->sort, sort->sort_nalloc);
8980 srenew(sort->sort2, sort->sort_nalloc);
8982 cgsort = sort->sort;
8984 switch (fr->cutoff_scheme)
8986 case ecutsGROUP:
8987 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8988 break;
8989 case ecutsVERLET:
8990 ncg_new = dd_sort_order_nbnxn(dd, fr);
8991 break;
8992 default:
8993 gmx_incons("unimplemented");
8994 ncg_new = 0;
8997 /* We alloc with the old size, since cgindex is still old */
8998 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8999 vbuf = dd->comm->vbuf.v;
9001 if (dd->comm->bCGs)
9003 cgindex = dd->cgindex;
9005 else
9007 cgindex = nullptr;
9010 /* Remove the charge groups which are no longer at home here */
9011 dd->ncg_home = ncg_new;
9012 if (debug)
9014 fprintf(debug, "Set the new home charge group count to %d\n",
9015 dd->ncg_home);
9018 /* Reorder the state */
9019 if (state->flags & (1 << estX))
9021 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
9023 if (state->flags & (1 << estV))
9025 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
9027 if (state->flags & (1 << estCGP))
9029 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9032 if (fr->cutoff_scheme == ecutsGROUP)
9034 /* Reorder cgcm */
9035 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9038 if (dd->ncg_home+1 > sort->ibuf_nalloc)
9040 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9041 srenew(sort->ibuf, sort->ibuf_nalloc);
9043 ibuf = sort->ibuf;
9044 /* Reorder the global cg index */
9045 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9046 /* Reorder the cginfo */
9047 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9048 /* Rebuild the local cg index */
9049 if (dd->comm->bCGs)
9051 ibuf[0] = 0;
9052 for (i = 0; i < dd->ncg_home; i++)
9054 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9055 ibuf[i+1] = ibuf[i] + cgsize;
9057 for (i = 0; i < dd->ncg_home+1; i++)
9059 dd->cgindex[i] = ibuf[i];
9062 else
9064 for (i = 0; i < dd->ncg_home+1; i++)
9066 dd->cgindex[i] = i;
9069 /* Set the home atom number */
9070 dd->nat_home = dd->cgindex[dd->ncg_home];
9072 if (fr->cutoff_scheme == ecutsVERLET)
9074 /* The atoms are now exactly in grid order, update the grid order */
9075 nbnxn_set_atomorder(fr->nbv->nbs);
9077 else
9079 /* Copy the sorted ns cell indices back to the ns grid struct */
9080 for (i = 0; i < dd->ncg_home; i++)
9082 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9084 fr->ns->grid->nr = dd->ncg_home;
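/*! \brief Adds the atom counts communicated for this partitioning to the running DD statistics */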
9088 static void add_dd_statistics(gmx_domdec_t *dd)
9090 gmx_domdec_comm_t *comm;
9091 int ddnat;
9093 comm = dd->comm;
9095 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9097 comm->sum_nat[ddnat-ddnatZONE] +=
9098 comm->nat[ddnat] - comm->nat[ddnat-1];
9100 comm->ndecomp++;
9103 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9105 gmx_domdec_comm_t *comm;
9106 int ddnat;
9108 comm = dd->comm;
9110 /* Reset all the statistics and counters for total run counting */
9111 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9113 comm->sum_nat[ddnat-ddnatZONE] = 0;
9115 comm->ndecomp = 0;
9116 comm->nload = 0;
9117 comm->load_step = 0;
9118 comm->load_sum = 0;
9119 comm->load_max = 0;
9120 clear_ivec(comm->load_lim);
9121 comm->load_mdf = 0;
9122 comm->load_pme = 0;
9125 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
9127 gmx_domdec_comm_t *comm;
9128 int ddnat;
9129 double av;
9131 comm = cr->dd->comm;
9133 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9135 if (fplog == nullptr)
9137 return;
9140 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9142 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9144 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9145 switch (ddnat)
9147 case ddnatZONE:
9148 fprintf(fplog,
9149 " av. #atoms communicated per step for force: %d x %.1f\n",
9150 2, av);
9151 break;
9152 case ddnatVSITE:
9153 if (cr->dd->vsite_comm)
9155 fprintf(fplog,
9156 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9157 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9158 av);
9160 break;
9161 case ddnatCON:
9162 if (cr->dd->constraint_comm)
9164 fprintf(fplog,
9165 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9166 1 + ir->nLincsIter, av);
9168 break;
9169 default:
9170 gmx_incons(" Unknown type for DD statistics");
9173 fprintf(fplog, "\n");
9175 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9177 print_dd_load_av(fplog, cr->dd);
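/* Performs one domain (re)partitioning: decides on load reporting and DLB state changes, then redistributes the state over the DD grid */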
9181 void dd_partition_system(FILE *fplog,
9182 gmx_int64_t step,
9183 const t_commrec *cr,
9184 gmx_bool bMasterState,
9185 int nstglobalcomm,
9186 t_state *state_global,
9187 const gmx_mtop_t *top_global,
9188 const t_inputrec *ir,
9189 t_state *state_local,
9190 PaddedRVecVector *f,
9191 gmx::MDAtoms *mdAtoms,
9192 gmx_localtop_t *top_local,
9193 t_forcerec *fr,
9194 gmx_vsite_t *vsite,
9195 gmx::Constraints *constr,
9196 t_nrnb *nrnb,
9197 gmx_wallcycle *wcycle,
9198 gmx_bool bVerbose)
9200 gmx_domdec_t *dd;
9201 gmx_domdec_comm_t *comm;
9202 gmx_ddbox_t ddbox = {0};
9203 t_block *cgs_gl;
9204 gmx_int64_t step_pcoupl;
9205 rvec cell_ns_x0, cell_ns_x1;
9206 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9207 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9208 gmx_bool bRedist, bSortCG, bResortAll;
9209 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9210 real grid_density;
9211 char sbuf[22];
9213 wallcycle_start(wcycle, ewcDOMDEC);
9215 dd = cr->dd;
9216 comm = dd->comm;
9218 bBoxChanged = (bMasterState || inputrecDeform(ir));
9219 if (ir->epc != epcNO)
9221 /* With nstpcouple > 1 pressure coupling happens
9222 * one step after calculating the pressure.
9223 * Box scaling happens at the end of the MD step,
9224 * after the DD partitioning.
9225 * We therefore have to do DLB in the first partitioning
9226 * after an MD step where P-coupling occurred.
9227 * We need to determine the last step in which p-coupling occurred.
9228 * MRS -- need to validate this for vv?
9230 n = ir->nstpcouple;
9231 if (n == 1)
9233 step_pcoupl = step - 1;
9235 else
9237 step_pcoupl = ((step - 1)/n)*n + 1;
9239 if (step_pcoupl >= comm->partition_step)
9241 bBoxChanged = TRUE;
9245 bNStGlobalComm = (step % nstglobalcomm == 0);
9247 if (!isDlbOn(comm))
9249 bDoDLB = FALSE;
9251 else
9253 /* Should we do dynamic load balancing this step?
9254 * Since it requires (possibly expensive) global communication,
9255 * we might want to do DLB less frequently.
9257 if (bBoxChanged || ir->epc != epcNO)
9259 bDoDLB = bBoxChanged;
9261 else
9263 bDoDLB = bNStGlobalComm;
9267 /* Check if we have recorded loads on the nodes */
9268 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9270 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9272 /* Print load every nstlog, first and last step to the log file */
9273 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9274 comm->n_load_collect == 0 ||
9275 (ir->nsteps >= 0 &&
9276 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9278 /* Avoid extra communication due to verbose screen output
9279 * when nstglobalcomm is set.
9281 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9282 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9284 get_load_distribution(dd, wcycle);
9285 if (DDMASTER(dd))
9287 if (bLogLoad)
9289 dd_print_load(fplog, dd, step-1);
9291 if (bVerbose)
9293 dd_print_load_verbose(dd);
9296 comm->n_load_collect++;
9298 if (isDlbOn(comm))
9300 if (DDMASTER(dd))
9302 /* Add the measured cycles to the running average */
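/* This is an exponential moving average:
 * new = 0.9 * old + 0.1 * (cycles per step measured since the last DD partitioning).
 */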
9303 const float averageFactor = 0.1f;
9304 comm->cyclesPerStepDlbExpAverage =
9305 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9306 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
9308 if (comm->dlbState == edlbsOnCanTurnOff &&
9309 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9311 gmx_bool turnOffDlb;
9312 if (DDMASTER(dd))
9314 /* If the running average of the cycles with DLB is higher
9315 * than before we turned on DLB, turn off DLB.
9316 * We will then run and check the cycles without DLB again,
9317 * and can then decide whether to turn off DLB forever.
9319 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9320 comm->cyclesPerStepBeforeDLB);
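/* Broadcast the master's decision so that all DD ranks take the same branch below. */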
9322 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9323 if (turnOffDlb)
9325 /* To turn off DLB, we need to redistribute the atoms */
9326 dd_collect_state(dd, state_local, state_global);
9327 bMasterState = TRUE;
9328 turn_off_dlb(fplog, cr, step);
9332 else if (bCheckWhetherToTurnDlbOn)
9334 gmx_bool turnOffDlbForever = FALSE;
9335 gmx_bool turnOnDlb = FALSE;
9337 /* Since the timings are node dependent, the master decides */
9338 if (DDMASTER(dd))
9340 /* If we recently turned off DLB, we want to check whether
9341 * performance is better without DLB. We want to do this
9342 * ASAP to minimize the chance that external factors that
9343 * slowed down the DLB steps are gone by now, which would
9344 * make us incorrectly conclude that DLB caused the slowdown.
9345 * So we measure one nstlist block, with no running average.
9347 if (comm->haveTurnedOffDlb &&
9348 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9349 comm->cyclesPerStepDlbExpAverage)
9351 /* After turning off DLB we ran nstlist steps in fewer
9352 * cycles than with DLB. This likely means that DLB
9353 * is not beneficial, but this could be due to a one-time
9354 * unlucky fluctuation, so we require two such
9355 * observations in close succession to turn off DLB
9356 * forever.
9358 if (comm->dlbSlowerPartitioningCount > 0 &&
9359 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9361 turnOffDlbForever = TRUE;
9363 comm->haveTurnedOffDlb = false;
9364 /* Register when we last measured DLB slowdown */
9365 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9367 else
9369 /* Here we check if the max PME rank load is more than 0.98
9370 * times the max PP force load. If so, PP DLB will not help,
9371 * since we are (almost) limited by PME. Furthermore,
9372 * DLB will cause a significant extra x/f redistribution
9373 * cost on the PME ranks, which will then surely result
9374 * in lower total performance.
9376 if (cr->npmenodes > 0 &&
9377 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9379 turnOnDlb = FALSE;
9381 else
9383 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
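/* Pack both flags into one anonymous struct so that a single dd_bcast
 * delivers the master's decisions to all DD ranks at once.
 */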
9387 struct
9389 gmx_bool turnOffDlbForever;
9390 gmx_bool turnOnDlb;
9392 bools {
9393 turnOffDlbForever, turnOnDlb
9395 dd_bcast(dd, sizeof(bools), &bools);
9396 if (bools.turnOffDlbForever)
9398 turn_off_dlb_forever(fplog, cr, step);
9400 else if (bools.turnOnDlb)
9402 turn_on_dlb(fplog, cr, step);
9403 bDoDLB = TRUE;
9407 comm->n_load_have++;
9410 cgs_gl = &comm->cgs_gl;
9412 bRedist = FALSE;
9413 if (bMasterState)
9415 /* Clear the old state */
9416 clear_dd_indices(dd, 0, 0);
9417 ncgindex_set = 0;
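/* Only the master rank has the global state at this point; it supplies
 * the global box and coordinates below, while all other ranks pass nullptr.
 */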
9419 rvec *xGlobal = (SIMMASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr);
9421 set_ddbox(dd, bMasterState, ir,
9422 SIMMASTER(cr) ? state_global->box : nullptr,
9423 TRUE, cgs_gl, xGlobal,
9424 &ddbox);
9426 get_cg_distribution(fplog, dd, cgs_gl,
9427 SIMMASTER(cr) ? state_global->box : nullptr,
9428 &ddbox, xGlobal);
9430 dd_distribute_state(dd, cgs_gl,
9431 state_global, state_local, f);
9433 dd_make_local_cgs(dd, &top_local->cgs);
9435 /* Ensure that we have space for the new distribution */
9436 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9438 if (fr->cutoff_scheme == ecutsGROUP)
9440 calc_cgcm(fplog, 0, dd->ncg_home,
9441 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9444 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9446 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9448 else if (state_local->ddp_count != dd->ddp_count)
9450 if (state_local->ddp_count > dd->ddp_count)
9452 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9455 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9457 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9460 /* Clear the old state */
9461 clear_dd_indices(dd, 0, 0);
9463 /* Build the new indices */
9464 rebuild_cgindex(dd, cgs_gl->index, state_local);
9465 make_dd_indices(dd, cgs_gl->index, 0);
9466 ncgindex_set = dd->ncg_home;
9468 if (fr->cutoff_scheme == ecutsGROUP)
9470 /* Redetermine the cg COMs */
9471 calc_cgcm(fplog, 0, dd->ncg_home,
9472 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9475 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9477 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9479 set_ddbox(dd, bMasterState, ir, state_local->box,
9480 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9482 bRedist = isDlbOn(comm);
9484 else
9486 /* We have the full state, only redistribute the cgs */
9488 /* Clear the non-home indices */
9489 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9490 ncgindex_set = 0;
9492 /* Avoid global communication for dimensions without pbc when using -gcom */
9493 if (!bNStGlobalComm)
9495 copy_rvec(comm->box0, ddbox.box0 );
9496 copy_rvec(comm->box_size, ddbox.box_size);
9498 set_ddbox(dd, bMasterState, ir, state_local->box,
9499 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9501 bBoxChanged = TRUE;
9502 bRedist = TRUE;
9504 /* For dimensions without pbc when using -gcom */
9505 copy_rvec(ddbox.box0, comm->box0 );
9506 copy_rvec(ddbox.box_size, comm->box_size);
9508 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9509 step, wcycle);
9511 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9513 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9516 /* Check if we should sort the charge groups */
9517 bSortCG = (bMasterState || bRedist);
9519 ncg_home_old = dd->ncg_home;
9521 /* When repartitioning we mark charge groups that will move to neighboring
9522 * DD cells, but we do not move them right away for performance reasons.
9523 * Thus we need to keep track of how many charge groups will move in order
9524 * to obtain correct local charge group / atom counts.
9526 ncg_moved = 0;
9527 if (bRedist)
9529 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9531 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9532 state_local, f, fr,
9533 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9535 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9538 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9539 dd, &ddbox,
9540 &comm->cell_x0, &comm->cell_x1,
9541 dd->ncg_home, fr->cg_cm,
9542 cell_ns_x0, cell_ns_x1, &grid_density);
9544 if (bBoxChanged)
9546 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9549 switch (fr->cutoff_scheme)
9551 case ecutsGROUP:
9552 copy_ivec(fr->ns->grid->n, ncells_old);
9553 grid_first(fplog, fr->ns->grid, dd, &ddbox,
9554 state_local->box, cell_ns_x0, cell_ns_x1,
9555 fr->rlist, grid_density);
9556 break;
9557 case ecutsVERLET:
9558 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9559 break;
9560 default:
9561 gmx_incons("unimplemented");
9563 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9564 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9566 if (bSortCG)
9568 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9570 /* Sort the state on charge group position.
9571 * This enables exact restarts from this step.
9572 * It also improves performance by about 15% with larger numbers
9573 * of atoms per node.
9576 /* Fill the ns grid with the home cell,
9577 * so we can sort with the indices.
9579 set_zones_ncg_home(dd);
9581 switch (fr->cutoff_scheme)
9583 case ecutsVERLET:
9584 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
9586 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9588 comm->zones.size[0].bb_x0,
9589 comm->zones.size[0].bb_x1,
9590 0, dd->ncg_home,
9591 comm->zones.dens_zone0,
9592 fr->cginfo,
9593 as_rvec_array(state_local->x.data()),
9594 ncg_moved, bRedist ? comm->moved : nullptr,
9595 fr->nbv->grp[eintLocal].kernel_type,
9596 fr->nbv->nbat);
9598 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9599 break;
9600 case ecutsGROUP:
9601 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9602 0, dd->ncg_home, fr->cg_cm);
9604 copy_ivec(fr->ns->grid->n, ncells_new);
9605 break;
9606 default:
9607 gmx_incons("unimplemented");
9610 bResortAll = bMasterState;
9612 /* Check if we can use the old order and ns grid cell indices
9613 * of the charge groups to sort the charge groups efficiently.
9615 if (ncells_new[XX] != ncells_old[XX] ||
9616 ncells_new[YY] != ncells_old[YY] ||
9617 ncells_new[ZZ] != ncells_old[ZZ])
9619 bResortAll = TRUE;
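/* Passing -1 to dd_sort_state below requests a full resort; otherwise the
 * previous order of the first ncg_home_old charge groups can be reused.
 */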
9622 if (debug)
9624 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9625 gmx_step_str(step, sbuf), dd->ncg_home);
9627 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9628 bResortAll ? -1 : ncg_home_old);
9630 /* After sorting and compacting we set the correct size */
9631 dd_resize_state(state_local, f, dd->nat_home);
9633 /* Rebuild all the indices */
9634 ga2la_clear(dd->ga2la);
9635 ncgindex_set = 0;
9637 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9640 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9642 /* Set up the communication and communicate the coordinates */
9643 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9645 /* Set the indices */
9646 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9648 /* Set the charge group boundaries for neighbor searching */
9649 set_cg_boundaries(&comm->zones);
9651 if (fr->cutoff_scheme == ecutsVERLET)
9653 set_zones_size(dd, state_local->box, &ddbox,
9654 bSortCG ? 1 : 0, comm->zones.n,
9658 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9660 /*
9661 write_dd_pdb("dd_home",step,"dump",top_global,cr,
9662 -1,as_rvec_array(state_local->x.data()),state_local->box);
9663 */
9665 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9667 /* Extract a local topology from the global topology */
9668 for (i = 0; i < dd->ndim; i++)
9670 np[dd->dim[i]] = comm->cd[i].np;
9672 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9673 comm->cellsize_min, np,
9675 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9676 vsite, top_global, top_local);
9678 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9680 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9682 /* Set up the special atom communication */
9683 n = comm->nat[ddnatZONE];
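/* n starts from the zone atom count and grows as vsite and constraint
 * communication atoms are appended; comm->nat[] stores the running totals.
 */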
9684 for (i = ddnatZONE+1; i < ddnatNR; i++)
9686 switch (i)
9688 case ddnatVSITE:
9689 if (vsite && vsite->n_intercg_vsite)
9691 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9693 break;
9694 case ddnatCON:
9695 if (dd->bInterCGcons || dd->bInterCGsettles)
9697 /* Only for inter-cg constraints do we need special code */
9698 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9699 constr, ir->nProjOrder,
9700 top_local->idef.il);
9702 break;
9703 default:
9704 gmx_incons("Unknown special atom type setup");
9706 comm->nat[i] = n;
9709 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9711 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9713 /* Make space for the extra coordinates for virtual site
9714 * or constraint communication.
9716 state_local->natoms = comm->nat[ddnatNR-1];
9718 dd_resize_state(state_local, f, state_local->natoms);
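/* nat_f_novirsum: the atom count for the separate force buffer used when
 * there are direct virial contributions. With inter-cg vsites it spans the
 * vsite-communicated atoms, with full electrostatics and inter-cg exclusions
 * all communicated atoms, and otherwise only the home atoms.
 */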
9720 if (fr->haveDirectVirialContributions)
9722 if (vsite && vsite->n_intercg_vsite)
9724 nat_f_novirsum = comm->nat[ddnatVSITE];
9726 else
9728 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9730 nat_f_novirsum = dd->nat_tot;
9732 else
9734 nat_f_novirsum = dd->nat_home;
9738 else
9740 nat_f_novirsum = 0;
9743 /* Set the number of atoms required for the force calculation.
9744 * Forces need to be constrained when doing energy
9745 * minimization. For simple simulations we could avoid some
9746 * allocation, zeroing and copying, but this is probably not worth
9747 * the complications and checking.
9749 forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9750 dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9752 /* Update atom data for mdatoms and several algorithms */
9753 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9754 nullptr, mdAtoms, vsite, nullptr);
9756 auto mdatoms = mdAtoms->mdatoms();
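/* The set of home atoms changed during repartitioning, so a PP rank passes
 * the updated per-atom charges and LJ parameters on to its PME-only node.
 */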
9757 if (!thisRankHasDuty(cr, DUTY_PME))
9759 /* Send the charges and/or c6/sigmas to our PME-only node */
9760 gmx_pme_send_parameters(cr,
9761 fr->ic,
9762 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9763 mdatoms->chargeA, mdatoms->chargeB,
9764 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9765 mdatoms->sigmaA, mdatoms->sigmaB,
9766 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9769 if (constr)
9771 set_constraints(constr, top_local, ir, mdatoms, cr);
9774 if (ir->bPull)
9776 /* Update the local pull groups */
9777 dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9780 if (ir->bRot)
9782 /* Update the local rotation groups */
9783 dd_make_local_rotation_groups(dd, ir->rot);
9786 if (ir->eSwapCoords != eswapNO)
9788 /* Update the local groups needed for ion swapping */
9789 dd_make_local_swap_groups(dd, ir->swap);
9792 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9793 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9795 add_dd_statistics(dd);
9797 /* Make sure we only count the cycles for this DD partitioning */
9798 clear_dd_cycle_counts(dd);
9800 /* Because the order of the atoms might have changed since
9801 * the last vsite construction, we need to communicate the constructing
9802 * atom coordinates again (for spreading the forces this MD step).
9804 dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9806 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9808 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9810 dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()), nullWallcycle);
9811 write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9812 -1, as_rvec_array(state_local->x.data()), state_local->box);
9815 /* Store the partitioning step */
9816 comm->partition_step = step;
9818 /* Increase the DD partitioning counter */
9819 dd->ddp_count++;
9820 /* The state currently matches this DD partitioning count, store it */
9821 state_local->ddp_count = dd->ddp_count;
9822 if (bMasterState)
9824 /* The DD master node knows the complete cg distribution;
9825 * store the count so we can possibly skip the cg info communication.
9827 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9830 if (comm->DD_debug > 0)
9832 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9833 check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9834 "after partitioning");
9837 wallcycle_stop(wcycle, ewcDOMDEC);
9840 /*! \brief Check whether bonded interactions are missing, if appropriate */
9841 void checkNumberOfBondedInteractions(FILE *fplog,
9842 t_commrec *cr,
9843 int totalNumberOfBondedInteractions,
9844 const gmx_mtop_t *top_global,
9845 const gmx_localtop_t *top_local,
9846 const t_state *state,
9847 bool *shouldCheckNumberOfBondedInteractions)
9849 if (*shouldCheckNumberOfBondedInteractions)
9851 if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
9853 dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
9855 *shouldCheckNumberOfBondedInteractions = false;