Decouple task assignment from task execution
[gromacs.git] / src / gromacs / domdec / domdec.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
49 #include <algorithm>
51 #include "gromacs/domdec/domdec_network.h"
52 #include "gromacs/domdec/ga2la.h"
53 #include "gromacs/ewald/pme.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gpu_utils/gpu_utils.h"
60 #include "gromacs/hardware/hw_info.h"
61 #include "gromacs/imd/imd.h"
62 #include "gromacs/listed-forces/manage-threading.h"
63 #include "gromacs/math/functions.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/math/vectypes.h"
66 #include "gromacs/mdlib/constr.h"
67 #include "gromacs/mdlib/force.h"
68 #include "gromacs/mdlib/forcerec.h"
69 #include "gromacs/mdlib/genborn.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/mdrun.h"
73 #include "gromacs/mdlib/mdsetup.h"
74 #include "gromacs/mdlib/nb_verlet.h"
75 #include "gromacs/mdlib/nbnxn_grid.h"
76 #include "gromacs/mdlib/nsgrid.h"
77 #include "gromacs/mdlib/vsite.h"
78 #include "gromacs/mdtypes/commrec.h"
79 #include "gromacs/mdtypes/df_history.h"
80 #include "gromacs/mdtypes/forcerec.h"
81 #include "gromacs/mdtypes/inputrec.h"
82 #include "gromacs/mdtypes/md_enums.h"
83 #include "gromacs/mdtypes/mdatom.h"
84 #include "gromacs/mdtypes/nblist.h"
85 #include "gromacs/mdtypes/state.h"
86 #include "gromacs/pbcutil/ishift.h"
87 #include "gromacs/pbcutil/pbc.h"
88 #include "gromacs/pulling/pull.h"
89 #include "gromacs/pulling/pull_rotation.h"
90 #include "gromacs/swap/swapcoords.h"
91 #include "gromacs/timing/wallcycle.h"
92 #include "gromacs/topology/block.h"
93 #include "gromacs/topology/idef.h"
94 #include "gromacs/topology/ifunc.h"
95 #include "gromacs/topology/mtop_lookup.h"
96 #include "gromacs/topology/mtop_util.h"
97 #include "gromacs/topology/topology.h"
98 #include "gromacs/utility/basedefinitions.h"
99 #include "gromacs/utility/basenetwork.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/exceptions.h"
102 #include "gromacs/utility/fatalerror.h"
103 #include "gromacs/utility/gmxmpi.h"
104 #include "gromacs/utility/qsort_threadsafe.h"
105 #include "gromacs/utility/real.h"
106 #include "gromacs/utility/smalloc.h"
107 #include "gromacs/utility/stringutil.h"
109 #include "domdec_constraints.h"
110 #include "domdec_internal.h"
111 #include "domdec_vsite.h"
113 #define DDRANK(dd, rank) (rank)
114 #define DDMASTERRANK(dd) (dd->masterrank)
116 struct gmx_domdec_master_t
118 /* The cell boundaries */
119 real **cell_x;
120 /* The global charge group division */
121 int *ncg; /* Number of home charge groups for each node */
122 int *index; /* Index (size nnodes+1) into cg */
123 int *cg; /* Global charge group index */
124 int *nat; /* Number of home atoms for each node. */
125 int *ibuf; /* Buffer for communication */
126 rvec *vbuf; /* Buffer for state scattering and gathering */
129 #define DD_NLOAD_MAX 9
131 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
133 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
134 #define DD_CGIBS 2
136 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
137 #define DD_FLAG_NRCG 65535
138 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
139 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
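/* Illustration of the flag layout defined above (editor's sketch, not from
 * the original source): the low 16 bits of a cggl_flag entry are masked by
 * DD_FLAG_NRCG and, in our reading, hold the size of the charge group being
 * moved; a move forward or backward along DD dimension d sets bit
 * DD_FLAG_FW(d) = 1<<(16+2*d) or DD_FLAG_BW(d) = 1<<(16+2*d+1), e.g. a
 * forward move along d = 1 sets bit 18.
 */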
141 /* The DD zone order */
142 static const ivec dd_zo[DD_MAXZONE] =
143 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
145 /* The non-bonded zone-pair setup for domain decomposition
146 * The first number is the i-zone, the second number the first j-zone seen by
147 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
148 * As is, this is for 3D decomposition, where there are 4 i-zones.
149 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
150 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
152 static const int
153 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
154 {1, 3, 6},
155 {2, 5, 6},
156 {3, 5, 7}};
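/* Reading of the triplets above (illustration only): i-zone 0 sees j-zones
 * 0..7, i-zone 1 sees j-zones 3..5, i-zone 2 sees j-zone 5 and i-zone 3 sees
 * j-zones 5..6; the third number is exclusive.
 */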
158 /* Factors used to avoid problems due to rounding issues */
159 #define DD_CELL_MARGIN 1.0001
160 #define DD_CELL_MARGIN2 1.00005
161 /* Factor to account for pressure scaling during nstlist steps */
162 #define DD_PRES_SCALE_MARGIN 1.02
164 /* Turn on DLB when the load imbalance causes this amount of total loss.
165 * There is a bit of overhead with DLB and it's difficult to achieve
166 * a load imbalance of less than 2% with DLB.
168 #define DD_PERF_LOSS_DLB_ON 0.02
170 /* Warn about PP or PP/PME load imbalance at this amount of performance loss */
171 #define DD_PERF_LOSS_WARN 0.05
173 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
175 /* Use separate MPI send and receive commands
176 * when nnodes <= GMX_DD_NNODES_SENDRECV.
177 * This saves memory (and some copying for small nnodes).
178 * For high parallelization scatter and gather calls are used.
180 #define GMX_DD_NNODES_SENDRECV 4
183 /* We check whether to turn on DLB at the first and every 100 DD partitionings.
184 * With large imbalance DLB will turn on at the first step, so we can
185 * make the interval so large that the MPI overhead of the check is negligible.
187 static const int c_checkTurnDlbOnInterval = 100;
188 /* We need to check if DLB results in worse performance and then turn it off.
189 * We check this more often than for turning DLB on, because DLB can scale
190 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
191 * and furthermore, we are already synchronizing often with DLB, so
192 * the overhead of the MPI Bcast is not that high.
194 static const int c_checkTurnDlbOffInterval = 20;
196 /* Forward declaration */
197 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
201 /* Legacy z-major ordering, kept commented out for reference only:
202  * #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
203  * static void index2xyz(ivec nc,int ind,ivec xyz)
204  * {
205  *     xyz[XX] = ind % nc[XX];
206  *     xyz[YY] = (ind / nc[XX]) % nc[YY];
207  *     xyz[ZZ] = ind / (nc[YY]*nc[XX]);
208  * }
209  */
211 /* This order is required to minimize the coordinate communication in PME
212 * which uses decomposition in the x direction.
214 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
216 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
218 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
219 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
220 xyz[ZZ] = ind % nc[ZZ];
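/* Worked example of the x-major ordering (illustration, not from the original
 * source): with nc = {2, 2, 2} the macro gives dd_index = (x*2 + y)*2 + z, so
 * DD ranks 0..3 all lie in the x = 0 slab and ranks 4..7 in x = 1, which keeps
 * the atoms of each PME x-slab on consecutive PP ranks.
 */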
223 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
225 int ddindex;
226 int ddnodeid = -1;
228 ddindex = dd_index(dd->nc, c);
229 if (dd->comm->bCartesianPP_PME)
231 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
233 else if (dd->comm->bCartesianPP)
235 #if GMX_MPI
236 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
237 #endif
239 else
241 ddnodeid = ddindex;
244 return ddnodeid;
247 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
249 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
252 int ddglatnr(const gmx_domdec_t *dd, int i)
254 int atnr;
256 if (dd == nullptr)
258 atnr = i + 1;
260 else
262 if (i >= dd->comm->nat[ddnatNR-1])
264 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
266 atnr = dd->gatindex[i] + 1;
269 return atnr;
272 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
274 return &dd->comm->cgs_gl;
277 /*! \brief Returns true if the DLB state indicates that the balancer is on. */
278 static bool isDlbOn(const gmx_domdec_comm_t *comm)
280 return (comm->dlbState == edlbsOnCanTurnOff ||
281 comm->dlbState == edlbsOnUser);
283 /*! \brief Returns true if the DLB state indicates that the balancer is off/disabled.
285 static bool isDlbDisabled(const gmx_domdec_comm_t *comm)
287 return (comm->dlbState == edlbsOffUser ||
288 comm->dlbState == edlbsOffForever);
291 static void vec_rvec_init(vec_rvec_t *v)
293 v->nalloc = 0;
294 v->v = nullptr;
297 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
299 if (n > v->nalloc)
301 v->nalloc = over_alloc_dd(n);
302 srenew(v->v, v->nalloc);
306 void dd_store_state(gmx_domdec_t *dd, t_state *state)
308 int i;
310 if (state->ddp_count != dd->ddp_count)
312 gmx_incons("The MD state does not match the domain decomposition state");
315 state->cg_gl.resize(dd->ncg_home);
316 for (i = 0; i < dd->ncg_home; i++)
318 state->cg_gl[i] = dd->index_gl[i];
321 state->ddp_count_cg_gl = dd->ddp_count;
324 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
326 return &dd->comm->zones;
329 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
330 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
332 gmx_domdec_zones_t *zones;
333 int izone, d, dim;
335 zones = &dd->comm->zones;
337 izone = 0;
338 while (icg >= zones->izone[izone].cg1)
340 izone++;
343 if (izone == 0)
345 *jcg0 = icg;
347 else if (izone < zones->nizone)
349 *jcg0 = zones->izone[izone].jcg0;
351 else
353 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
354 icg, izone, zones->nizone);
357 *jcg1 = zones->izone[izone].jcg1;
359 for (d = 0; d < dd->ndim; d++)
361 dim = dd->dim[d];
362 shift0[dim] = zones->izone[izone].shift0[dim];
363 shift1[dim] = zones->izone[izone].shift1[dim];
364 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
366 /* A conservative approach, this can be optimized */
367 shift0[dim] -= 1;
368 shift1[dim] += 1;
373 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
375 /* We currently set mdatoms entries for all atoms:
376 * local + non-local + communicated for vsite + constraints
379 return dd->comm->nat[ddnatNR - 1];
382 int dd_natoms_vsite(const gmx_domdec_t *dd)
384 return dd->comm->nat[ddnatVSITE];
387 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
389 *at_start = dd->comm->nat[ddnatCON-1];
390 *at_end = dd->comm->nat[ddnatCON];
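/* Layout sketch (editor's reading of the accessors above): comm->nat[] holds
 * cumulative atom counts per communication stage, so the
 * constraint-communicated atoms occupy [ nat[ddnatCON-1], nat[ddnatCON] ) and
 * nat[ddnatNR-1] is the grand total that mdatoms is set up for.
 */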
393 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
395 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
396 int *index, *cgindex;
397 gmx_domdec_comm_t *comm;
398 gmx_domdec_comm_dim_t *cd;
399 gmx_domdec_ind_t *ind;
400 rvec shift = {0, 0, 0}, *buf, *rbuf;
401 gmx_bool bPBC, bScrew;
403 comm = dd->comm;
405 cgindex = dd->cgindex;
407 buf = comm->vbuf.v;
409 nzone = 1;
410 nat_tot = dd->nat_home;
411 for (d = 0; d < dd->ndim; d++)
413 bPBC = (dd->ci[dd->dim[d]] == 0);
414 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
415 if (bPBC)
417 copy_rvec(box[dd->dim[d]], shift);
419 cd = &comm->cd[d];
420 for (p = 0; p < cd->np; p++)
422 ind = &cd->ind[p];
423 index = ind->index;
424 n = 0;
425 if (!bPBC)
427 for (i = 0; i < ind->nsend[nzone]; i++)
429 at0 = cgindex[index[i]];
430 at1 = cgindex[index[i]+1];
431 for (j = at0; j < at1; j++)
433 copy_rvec(x[j], buf[n]);
434 n++;
438 else if (!bScrew)
440 for (i = 0; i < ind->nsend[nzone]; i++)
442 at0 = cgindex[index[i]];
443 at1 = cgindex[index[i]+1];
444 for (j = at0; j < at1; j++)
446 /* We need to shift the coordinates */
447 rvec_add(x[j], shift, buf[n]);
448 n++;
452 else
454 for (i = 0; i < ind->nsend[nzone]; i++)
456 at0 = cgindex[index[i]];
457 at1 = cgindex[index[i]+1];
458 for (j = at0; j < at1; j++)
460 /* Shift x */
461 buf[n][XX] = x[j][XX] + shift[XX];
462 /* Rotate y and z.
463 * This operation requires a special shift force
464 * treatment, which is performed in calc_vir.
466 buf[n][YY] = box[YY][YY] - x[j][YY];
467 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
468 n++;
473 if (cd->bInPlace)
475 rbuf = x + nat_tot;
477 else
479 rbuf = comm->vbuf2.v;
481 /* Send and receive the coordinates */
482 dd_sendrecv_rvec(dd, d, dddirBackward,
483 buf, ind->nsend[nzone+1],
484 rbuf, ind->nrecv[nzone+1]);
485 if (!cd->bInPlace)
487 j = 0;
488 for (zone = 0; zone < nzone; zone++)
490 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
492 copy_rvec(rbuf[j], x[i]);
493 j++;
497 nat_tot += ind->nrecv[nzone+1];
499 nzone += nzone;
503 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
505 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
506 int *index, *cgindex;
507 gmx_domdec_comm_t *comm;
508 gmx_domdec_comm_dim_t *cd;
509 gmx_domdec_ind_t *ind;
510 rvec *buf, *sbuf;
511 ivec vis;
512 int is;
513 gmx_bool bShiftForcesNeedPbc, bScrew;
515 comm = dd->comm;
517 cgindex = dd->cgindex;
519 buf = comm->vbuf.v;
521 nzone = comm->zones.n/2;
522 nat_tot = dd->nat_tot;
523 for (d = dd->ndim-1; d >= 0; d--)
525 /* Only forces in domains near the PBC boundaries need to
526 consider PBC in the treatment of fshift */
527 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
528 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
529 if (fshift == nullptr && !bScrew)
531 bShiftForcesNeedPbc = FALSE;
533 /* Determine which shift vector we need */
534 clear_ivec(vis);
535 vis[dd->dim[d]] = 1;
536 is = IVEC2IS(vis);
538 cd = &comm->cd[d];
539 for (p = cd->np-1; p >= 0; p--)
541 ind = &cd->ind[p];
542 nat_tot -= ind->nrecv[nzone+1];
543 if (cd->bInPlace)
545 sbuf = f + nat_tot;
547 else
549 sbuf = comm->vbuf2.v;
550 j = 0;
551 for (zone = 0; zone < nzone; zone++)
553 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
555 copy_rvec(f[i], sbuf[j]);
556 j++;
560 /* Communicate the forces */
561 dd_sendrecv_rvec(dd, d, dddirForward,
562 sbuf, ind->nrecv[nzone+1],
563 buf, ind->nsend[nzone+1]);
564 index = ind->index;
565 /* Add the received forces */
566 n = 0;
567 if (!bShiftForcesNeedPbc)
569 for (i = 0; i < ind->nsend[nzone]; i++)
571 at0 = cgindex[index[i]];
572 at1 = cgindex[index[i]+1];
573 for (j = at0; j < at1; j++)
575 rvec_inc(f[j], buf[n]);
576 n++;
580 else if (!bScrew)
582 /* fshift should always be defined if this function is
583 * called when bShiftForcesNeedPbc is true */
584 assert(NULL != fshift);
585 for (i = 0; i < ind->nsend[nzone]; i++)
587 at0 = cgindex[index[i]];
588 at1 = cgindex[index[i]+1];
589 for (j = at0; j < at1; j++)
591 rvec_inc(f[j], buf[n]);
592 /* Add this force to the shift force */
593 rvec_inc(fshift[is], buf[n]);
594 n++;
598 else
600 for (i = 0; i < ind->nsend[nzone]; i++)
602 at0 = cgindex[index[i]];
603 at1 = cgindex[index[i]+1];
604 for (j = at0; j < at1; j++)
606 /* Rotate the force */
607 f[j][XX] += buf[n][XX];
608 f[j][YY] -= buf[n][YY];
609 f[j][ZZ] -= buf[n][ZZ];
610 if (fshift)
612 /* Add this force to the shift force */
613 rvec_inc(fshift[is], buf[n]);
615 n++;
620 nzone /= 2;
624 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
626 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
627 int *index, *cgindex;
628 gmx_domdec_comm_t *comm;
629 gmx_domdec_comm_dim_t *cd;
630 gmx_domdec_ind_t *ind;
631 real *buf, *rbuf;
633 comm = dd->comm;
635 cgindex = dd->cgindex;
637 buf = &comm->vbuf.v[0][0];
639 nzone = 1;
640 nat_tot = dd->nat_home;
641 for (d = 0; d < dd->ndim; d++)
643 cd = &comm->cd[d];
644 for (p = 0; p < cd->np; p++)
646 ind = &cd->ind[p];
647 index = ind->index;
648 n = 0;
649 for (i = 0; i < ind->nsend[nzone]; i++)
651 at0 = cgindex[index[i]];
652 at1 = cgindex[index[i]+1];
653 for (j = at0; j < at1; j++)
655 buf[n] = v[j];
656 n++;
660 if (cd->bInPlace)
662 rbuf = v + nat_tot;
664 else
666 rbuf = &comm->vbuf2.v[0][0];
668 /* Send and receive the values */
669 dd_sendrecv_real(dd, d, dddirBackward,
670 buf, ind->nsend[nzone+1],
671 rbuf, ind->nrecv[nzone+1]);
672 if (!cd->bInPlace)
674 j = 0;
675 for (zone = 0; zone < nzone; zone++)
677 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
679 v[i] = rbuf[j];
680 j++;
684 nat_tot += ind->nrecv[nzone+1];
686 nzone += nzone;
690 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
692 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
693 int *index, *cgindex;
694 gmx_domdec_comm_t *comm;
695 gmx_domdec_comm_dim_t *cd;
696 gmx_domdec_ind_t *ind;
697 real *buf, *sbuf;
699 comm = dd->comm;
701 cgindex = dd->cgindex;
703 buf = &comm->vbuf.v[0][0];
705 nzone = comm->zones.n/2;
706 nat_tot = dd->nat_tot;
707 for (d = dd->ndim-1; d >= 0; d--)
709 cd = &comm->cd[d];
710 for (p = cd->np-1; p >= 0; p--)
712 ind = &cd->ind[p];
713 nat_tot -= ind->nrecv[nzone+1];
714 if (cd->bInPlace)
716 sbuf = v + nat_tot;
718 else
720 sbuf = &comm->vbuf2.v[0][0];
721 j = 0;
722 for (zone = 0; zone < nzone; zone++)
724 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
726 sbuf[j] = v[i];
727 j++;
731 /* Communicate the values */
732 dd_sendrecv_real(dd, d, dddirForward,
733 sbuf, ind->nrecv[nzone+1],
734 buf, ind->nsend[nzone+1]);
735 index = ind->index;
736 /* Add the received values */
737 n = 0;
738 for (i = 0; i < ind->nsend[nzone]; i++)
740 at0 = cgindex[index[i]];
741 at1 = cgindex[index[i]+1];
742 for (j = at0; j < at1; j++)
744 v[j] += buf[n];
745 n++;
749 nzone /= 2;
753 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
755 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
756 d, i, j,
757 zone->min0, zone->max1,
758 zone->mch0, zone->mch1,
759 zone->p1_0, zone->p1_1);
763 #define DDZONECOMM_MAXZONE 5
764 #define DDZONECOMM_BUFSIZE 3
766 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
767 int ddimind, int direction,
768 gmx_ddzone_t *buf_s, int n_s,
769 gmx_ddzone_t *buf_r, int n_r)
771 #define ZBS DDZONECOMM_BUFSIZE
772 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
773 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
774 int i;
776 for (i = 0; i < n_s; i++)
778 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
779 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
780 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
781 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
782 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
783 vbuf_s[i*ZBS+1][2] = 0;
784 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
785 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
786 vbuf_s[i*ZBS+2][2] = 0;
789 dd_sendrecv_rvec(dd, ddimind, direction,
790 vbuf_s, n_s*ZBS,
791 vbuf_r, n_r*ZBS);
793 for (i = 0; i < n_r; i++)
795 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
796 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
797 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
798 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
799 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
800 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
801 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
804 #undef ZBS
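/* Packing used above: each gmx_ddzone_t travels as DDZONECOMM_BUFSIZE = 3
 * rvecs, i.e. 9 reals, of which 7 carry data (min0, max1, min1, mch0, mch1,
 * p1_0, p1_1) and the remaining 2 are zero padding.
 */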
807 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
808 rvec cell_ns_x0, rvec cell_ns_x1)
810 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
811 gmx_ddzone_t *zp;
812 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
813 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
814 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
815 rvec extr_s[2], extr_r[2];
816 rvec dh;
817 real dist_d, c = 0, det;
818 gmx_domdec_comm_t *comm;
819 gmx_bool bPBC, bUse;
821 comm = dd->comm;
823 for (d = 1; d < dd->ndim; d++)
825 dim = dd->dim[d];
826 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
827 zp->min0 = cell_ns_x0[dim];
828 zp->max1 = cell_ns_x1[dim];
829 zp->min1 = cell_ns_x1[dim];
830 zp->mch0 = cell_ns_x0[dim];
831 zp->mch1 = cell_ns_x1[dim];
832 zp->p1_0 = cell_ns_x0[dim];
833 zp->p1_1 = cell_ns_x1[dim];
836 for (d = dd->ndim-2; d >= 0; d--)
838 dim = dd->dim[d];
839 bPBC = (dim < ddbox->npbcdim);
841 /* Use an rvec to store two reals */
842 extr_s[d][0] = comm->cell_f0[d+1];
843 extr_s[d][1] = comm->cell_f1[d+1];
844 extr_s[d][2] = comm->cell_f1[d+1];
846 pos = 0;
847 /* Store the extremes in the backward sending buffer,
848 * so that they get updated separately from the forward communication.
850 for (d1 = d; d1 < dd->ndim-1; d1++)
852 /* We invert the order to be able to use the same loop for buf_e */
853 buf_s[pos].min0 = extr_s[d1][1];
854 buf_s[pos].max1 = extr_s[d1][0];
855 buf_s[pos].min1 = extr_s[d1][2];
856 buf_s[pos].mch0 = 0;
857 buf_s[pos].mch1 = 0;
858 /* Store the cell corner of the dimension we communicate along */
859 buf_s[pos].p1_0 = comm->cell_x0[dim];
860 buf_s[pos].p1_1 = 0;
861 pos++;
864 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
865 pos++;
867 if (dd->ndim == 3 && d == 0)
869 buf_s[pos] = comm->zone_d2[0][1];
870 pos++;
871 buf_s[pos] = comm->zone_d1[0];
872 pos++;
875 /* We only need to communicate the extremes
876 * in the forward direction
878 npulse = comm->cd[d].np;
879 if (bPBC)
881 /* Take the minimum to avoid double communication */
882 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
884 else
886 /* Without PBC we should really not communicate over
887 * the boundaries, but implementing that complicates
888 * the communication setup and therefore we simply
889 * do all communication, but ignore some data.
891 npulse_min = npulse;
893 for (p = 0; p < npulse_min; p++)
895 /* Communicate the extremes forward */
896 bUse = (bPBC || dd->ci[dim] > 0);
898 dd_sendrecv_rvec(dd, d, dddirForward,
899 extr_s+d, dd->ndim-d-1,
900 extr_r+d, dd->ndim-d-1);
902 if (bUse)
904 for (d1 = d; d1 < dd->ndim-1; d1++)
906 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
907 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
908 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
913 buf_size = pos;
914 for (p = 0; p < npulse; p++)
916 /* Communicate all the zone information backward */
917 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
919 dd_sendrecv_ddzone(dd, d, dddirBackward,
920 buf_s, buf_size,
921 buf_r, buf_size);
923 clear_rvec(dh);
924 if (p > 0)
926 for (d1 = d+1; d1 < dd->ndim; d1++)
928 /* Determine the decrease of maximum required
929 * communication height along d1 due to the distance along d;
930 * this avoids a lot of useless atom communication.
932 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
934 if (ddbox->tric_dir[dim])
936 /* c is the off-diagonal coupling between the cell planes
937 * along directions d and d1.
939 c = ddbox->v[dim][dd->dim[d1]][dim];
941 else
943 c = 0;
945 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
946 if (det > 0)
948 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
950 else
952 /* A negative value signals out of range */
953 dh[d1] = -1;
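/* Geometry behind the formula above (editor's sketch, not from the original
 * source): with skew factor c, a point at height h along d1 and offset dist_d
 * along d is in range when (dist_d - c*h)^2 + h^2 <= cutoff^2. Solving the
 * quadratic for the largest h gives h = (c*dist_d + sqrt(det))/(1 + c*c) with
 * det = (1 + c*c)*cutoff^2 - dist_d^2, so the required communication height
 * shrinks by dh = cutoff - h; det <= 0 means nothing at this offset can be
 * within the cut-off.
 */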
958 /* Accumulate the extremes over all pulses */
959 for (i = 0; i < buf_size; i++)
961 if (p == 0)
963 buf_e[i] = buf_r[i];
965 else
967 if (bUse)
969 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
970 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
971 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
974 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
976 d1 = 1;
978 else
980 d1 = d + 1;
982 if (bUse && dh[d1] >= 0)
984 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
985 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
988 /* Copy the received buffer to the send buffer,
989 * to pass the data through with the next pulse.
991 buf_s[i] = buf_r[i];
993 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
994 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
996 /* Store the extremes */
997 pos = 0;
999 for (d1 = d; d1 < dd->ndim-1; d1++)
1001 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1002 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1003 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1004 pos++;
1007 if (d == 1 || (d == 0 && dd->ndim == 3))
1009 for (i = d; i < 2; i++)
1011 comm->zone_d2[1-d][i] = buf_e[pos];
1012 pos++;
1015 if (d == 0)
1017 comm->zone_d1[1] = buf_e[pos];
1018 pos++;
1024 if (dd->ndim >= 2)
1026 dim = dd->dim[1];
1027 for (i = 0; i < 2; i++)
1029 if (debug)
1031 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1033 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1034 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1037 if (dd->ndim >= 3)
1039 dim = dd->dim[2];
1040 for (i = 0; i < 2; i++)
1042 for (j = 0; j < 2; j++)
1044 if (debug)
1046 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1048 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1049 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1053 for (d = 1; d < dd->ndim; d++)
1055 comm->cell_f_max0[d] = extr_s[d-1][0];
1056 comm->cell_f_min1[d] = extr_s[d-1][1];
1057 if (debug)
1059 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1060 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1065 static void dd_collect_cg(gmx_domdec_t *dd,
1066 t_state *state_local)
1068 gmx_domdec_master_t *ma = nullptr;
1069 int buf2[2], *ibuf, i, ncg_home = 0, *cg = nullptr, nat_home = 0;
1071 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1073 /* The master has the correct distribution */
1074 return;
1077 if (state_local->ddp_count == dd->ddp_count)
1079 /* The local state and DD are in sync, use the DD indices */
1080 ncg_home = dd->ncg_home;
1081 cg = dd->index_gl;
1082 nat_home = dd->nat_home;
1084 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1086 /* The DD is out of sync with the local state, but we have stored
1087 * the cg indices with the local state, so we can use those.
1089 t_block *cgs_gl;
1091 cgs_gl = &dd->comm->cgs_gl;
1093 ncg_home = state_local->cg_gl.size();
1094 cg = state_local->cg_gl.data();
1095 nat_home = 0;
1096 for (i = 0; i < ncg_home; i++)
1098 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1101 else
1103 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1106 buf2[0] = ncg_home;
1107 buf2[1] = nat_home;
1108 if (DDMASTER(dd))
1110 ma = dd->ma;
1111 ibuf = ma->ibuf;
1113 else
1115 ibuf = nullptr;
1117 /* Collect the charge group and atom counts on the master */
1118 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1120 if (DDMASTER(dd))
1122 ma->index[0] = 0;
1123 for (i = 0; i < dd->nnodes; i++)
1125 ma->ncg[i] = ma->ibuf[2*i];
1126 ma->nat[i] = ma->ibuf[2*i+1];
1127 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1130 /* Make byte counts and indices */
1131 for (i = 0; i < dd->nnodes; i++)
1133 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1134 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1136 if (debug)
1138 fprintf(debug, "Initial charge group distribution: ");
1139 for (i = 0; i < dd->nnodes; i++)
1141 fprintf(debug, " %d", ma->ncg[i]);
1143 fprintf(debug, "\n");
1147 /* Collect the charge group indices on the master */
1148 dd_gatherv(dd,
1149 ncg_home*sizeof(int), cg,
1150 DDMASTER(dd) ? ma->ibuf : nullptr,
1151 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1152 DDMASTER(dd) ? ma->cg : nullptr);
1154 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1157 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1158 const rvec *lv, rvec *v)
1160 gmx_domdec_master_t *ma;
1161 int n, i, c, a, nalloc = 0;
1162 rvec *buf = nullptr;
1163 t_block *cgs_gl;
1165 ma = dd->ma;
1167 if (!DDMASTER(dd))
1169 #if GMX_MPI
1170 MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
1171 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1172 #endif
1174 else
1176 /* Copy the master coordinates to the global array */
1177 cgs_gl = &dd->comm->cgs_gl;
1179 n = DDMASTERRANK(dd);
1180 a = 0;
1181 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1183 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1185 copy_rvec(lv[a++], v[c]);
1189 for (n = 0; n < dd->nnodes; n++)
1191 if (n != dd->rank)
1193 if (ma->nat[n] > nalloc)
1195 nalloc = over_alloc_dd(ma->nat[n]);
1196 srenew(buf, nalloc);
1198 #if GMX_MPI
1199 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1200 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1201 #endif
1202 a = 0;
1203 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1205 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1207 copy_rvec(buf[a++], v[c]);
1212 sfree(buf);
1216 static void get_commbuffer_counts(gmx_domdec_t *dd,
1217 int **counts, int **disps)
1219 gmx_domdec_master_t *ma;
1220 int n;
1222 ma = dd->ma;
1224 /* Make the rvec count and displacement arrays */
1225 *counts = ma->ibuf;
1226 *disps = ma->ibuf + dd->nnodes;
1227 for (n = 0; n < dd->nnodes; n++)
1229 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1230 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1234 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1235 const rvec *lv, rvec *v)
1237 gmx_domdec_master_t *ma;
1238 int *rcounts = nullptr, *disps = nullptr;
1239 int n, i, c, a;
1240 rvec *buf = nullptr;
1241 t_block *cgs_gl;
1243 ma = dd->ma;
1245 if (DDMASTER(dd))
1247 get_commbuffer_counts(dd, &rcounts, &disps);
1249 buf = ma->vbuf;
1252 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1254 if (DDMASTER(dd))
1256 cgs_gl = &dd->comm->cgs_gl;
1258 a = 0;
1259 for (n = 0; n < dd->nnodes; n++)
1261 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1263 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1265 copy_rvec(buf[a++], v[c]);
1272 void dd_collect_vec(gmx_domdec_t *dd,
1273 t_state *state_local,
1274 const PaddedRVecVector *localVector,
1275 rvec *v)
1277 dd_collect_cg(dd, state_local);
1279 const rvec *lv = as_rvec_array(localVector->data());
1281 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1283 dd_collect_vec_sendrecv(dd, lv, v);
1285 else
1287 dd_collect_vec_gatherv(dd, lv, v);
1291 void dd_collect_vec(gmx_domdec_t *dd,
1292 t_state *state_local,
1293 const PaddedRVecVector *localVector,
1294 PaddedRVecVector *vector)
1296 dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
1300 void dd_collect_state(gmx_domdec_t *dd,
1301 t_state *state_local, t_state *state)
1303 int nh = state->nhchainlength;
1305 if (DDMASTER(dd))
1307 for (int i = 0; i < efptNR; i++)
1309 state->lambda[i] = state_local->lambda[i];
1311 state->fep_state = state_local->fep_state;
1312 state->veta = state_local->veta;
1313 state->vol0 = state_local->vol0;
1314 copy_mat(state_local->box, state->box);
1315 copy_mat(state_local->boxv, state->boxv);
1316 copy_mat(state_local->svir_prev, state->svir_prev);
1317 copy_mat(state_local->fvir_prev, state->fvir_prev);
1318 copy_mat(state_local->pres_prev, state->pres_prev);
1320 for (int i = 0; i < state_local->ngtc; i++)
1322 for (int j = 0; j < nh; j++)
1324 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1325 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1327 state->therm_integral[i] = state_local->therm_integral[i];
1329 for (int i = 0; i < state_local->nnhpres; i++)
1331 for (int j = 0; j < nh; j++)
1333 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1334 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1337 state->baros_integral = state_local->baros_integral;
1339 if (state_local->flags & (1 << estX))
1341 dd_collect_vec(dd, state_local, &state_local->x, &state->x);
1343 if (state_local->flags & (1 << estV))
1345 dd_collect_vec(dd, state_local, &state_local->v, &state->v);
1347 if (state_local->flags & (1 << estCGP))
1349 dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
1353 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1355 if (debug)
1357 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1360 state_change_natoms(state, natoms);
1362 if (f != nullptr)
1364 /* We need to allocate one element extra, since we might use
1365 * (unaligned) 4-wide SIMD loads to access rvec entries.
1367 f->resize(natoms + 1);
1371 static void dd_check_alloc_ncg(t_forcerec *fr,
1372 t_state *state,
1373 PaddedRVecVector *f,
1374 int numChargeGroups)
1376 if (numChargeGroups > fr->cg_nalloc)
1378 if (debug)
1380 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1382 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1383 srenew(fr->cginfo, fr->cg_nalloc);
1384 if (fr->cutoff_scheme == ecutsGROUP)
1386 srenew(fr->cg_cm, fr->cg_nalloc);
1389 if (fr->cutoff_scheme == ecutsVERLET)
1391 /* We don't use charge groups, we use x in state to set up
1392 * the atom communication.
1394 dd_resize_state(state, f, numChargeGroups);
1398 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1399 rvec *v, rvec *lv)
1401 gmx_domdec_master_t *ma;
1402 int n, i, c, a, nalloc = 0;
1403 rvec *buf = nullptr;
1405 if (DDMASTER(dd))
1407 ma = dd->ma;
1409 for (n = 0; n < dd->nnodes; n++)
1411 if (n != dd->rank)
1413 if (ma->nat[n] > nalloc)
1415 nalloc = over_alloc_dd(ma->nat[n]);
1416 srenew(buf, nalloc);
1418 /* Use lv as a temporary buffer */
1419 a = 0;
1420 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1424 copy_rvec(v[c], buf[a++]);
1427 if (a != ma->nat[n])
1429 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1430 a, ma->nat[n]);
1433 #if GMX_MPI
1434 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1435 DDRANK(dd, n), n, dd->mpi_comm_all);
1436 #endif
1439 sfree(buf);
1440 n = DDMASTERRANK(dd);
1441 a = 0;
1442 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1444 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1446 copy_rvec(v[c], lv[a++]);
1450 else
1452 #if GMX_MPI
1453 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1454 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1455 #endif
1459 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1460 rvec *v, rvec *lv)
1462 gmx_domdec_master_t *ma;
1463 int *scounts = nullptr, *disps = nullptr;
1464 int n, i, c, a;
1465 rvec *buf = nullptr;
1467 if (DDMASTER(dd))
1469 ma = dd->ma;
1471 get_commbuffer_counts(dd, &scounts, &disps);
1473 buf = ma->vbuf;
1474 a = 0;
1475 for (n = 0; n < dd->nnodes; n++)
1477 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1479 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1481 copy_rvec(v[c], buf[a++]);
1487 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1490 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1492 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1494 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1496 else
1498 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1502 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1504 if (dfhist == nullptr)
1506 return;
1509 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1510 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1511 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1513 if (dfhist->nlambda > 0)
1515 int nlam = dfhist->nlambda;
1516 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1517 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1518 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1519 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1520 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1521 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1523 for (int i = 0; i < nlam; i++)
1525 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1526 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1527 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1528 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1529 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1530 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1535 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1536 t_state *state, t_state *state_local,
1537 PaddedRVecVector *f)
1539 int nh = state->nhchainlength;
1541 if (DDMASTER(dd))
1543 for (int i = 0; i < efptNR; i++)
1545 state_local->lambda[i] = state->lambda[i];
1547 state_local->fep_state = state->fep_state;
1548 state_local->veta = state->veta;
1549 state_local->vol0 = state->vol0;
1550 copy_mat(state->box, state_local->box);
1551 copy_mat(state->box_rel, state_local->box_rel);
1552 copy_mat(state->boxv, state_local->boxv);
1553 copy_mat(state->svir_prev, state_local->svir_prev);
1554 copy_mat(state->fvir_prev, state_local->fvir_prev);
1555 if (state->dfhist != nullptr)
1557 copy_df_history(state_local->dfhist, state->dfhist);
1559 for (int i = 0; i < state_local->ngtc; i++)
1561 for (int j = 0; j < nh; j++)
1563 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1564 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1566 state_local->therm_integral[i] = state->therm_integral[i];
1568 for (int i = 0; i < state_local->nnhpres; i++)
1570 for (int j = 0; j < nh; j++)
1572 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1573 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1576 state_local->baros_integral = state->baros_integral;
1578 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1579 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1580 dd_bcast(dd, sizeof(real), &state_local->veta);
1581 dd_bcast(dd, sizeof(real), &state_local->vol0);
1582 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1583 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1584 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1585 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1586 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1587 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1588 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1589 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1590 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1591 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1593 /* communicate df_history -- required for restarting from checkpoint */
1594 dd_distribute_dfhist(dd, state_local->dfhist);
1596 dd_resize_state(state_local, f, dd->nat_home);
1598 if (state_local->flags & (1 << estX))
1600 dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
1602 if (state_local->flags & (1 << estV))
1604 dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
1606 if (state_local->flags & (1 << estCGP))
1608 dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
1612 static char dim2char(int dim)
1614 char c = '?';
1616 switch (dim)
1618 case XX: c = 'X'; break;
1619 case YY: c = 'Y'; break;
1620 case ZZ: c = 'Z'; break;
1621 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1624 return c;
1627 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1628 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1630 rvec grid_s[2], *grid_r = nullptr, cx, r;
1631 char fname[STRLEN], buf[22];
1632 FILE *out;
1633 int a, i, d, z, y, x;
1634 matrix tric;
1635 real vol;
1637 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1638 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1640 if (DDMASTER(dd))
1642 snew(grid_r, 2*dd->nnodes);
1645 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
1647 if (DDMASTER(dd))
1649 for (d = 0; d < DIM; d++)
1651 for (i = 0; i < DIM; i++)
1653 if (d == i)
1655 tric[d][i] = 1;
1657 else
1659 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1661 tric[d][i] = box[i][d]/box[i][i];
1663 else
1665 tric[d][i] = 0;
1670 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1671 out = gmx_fio_fopen(fname, "w");
1672 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1673 a = 1;
1674 for (i = 0; i < dd->nnodes; i++)
1676 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1677 for (d = 0; d < DIM; d++)
1679 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1681 for (z = 0; z < 2; z++)
1683 for (y = 0; y < 2; y++)
1685 for (x = 0; x < 2; x++)
1687 cx[XX] = grid_r[i*2+x][XX];
1688 cx[YY] = grid_r[i*2+y][YY];
1689 cx[ZZ] = grid_r[i*2+z][ZZ];
1690 mvmul(tric, cx, r);
1691 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1692 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1696 for (d = 0; d < DIM; d++)
1698 for (x = 0; x < 4; x++)
1700 switch (d)
1702 case 0: y = 1 + i*8 + 2*x; break;
1703 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1704 case 2: y = 1 + i*8 + x; break;
1706 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1710 gmx_fio_fclose(out);
1711 sfree(grid_r);
1715 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1716 const gmx_mtop_t *mtop, t_commrec *cr,
1717 int natoms, rvec x[], matrix box)
1719 char fname[STRLEN], buf[22];
1720 FILE *out;
1721 int i, ii, resnr, c;
1722 const char *atomname, *resname;
1723 real b;
1724 gmx_domdec_t *dd;
1726 dd = cr->dd;
1727 if (natoms == -1)
1729 natoms = dd->comm->nat[ddnatVSITE];
1732 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1734 out = gmx_fio_fopen(fname, "w");
1736 fprintf(out, "TITLE %s\n", title);
1737 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1738 int molb = 0;
1739 for (i = 0; i < natoms; i++)
1741 ii = dd->gatindex[i];
1742 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1743 if (i < dd->comm->nat[ddnatZONE])
1745 c = 0;
1746 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1748 c++;
1750 b = c;
1752 else if (i < dd->comm->nat[ddnatVSITE])
1754 b = dd->comm->zones.n;
1756 else
1758 b = dd->comm->zones.n + 1;
1760 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1761 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1763 fprintf(out, "TER\n");
1765 gmx_fio_fclose(out);
1768 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1770 gmx_domdec_comm_t *comm;
1771 int di;
1772 real r;
1774 comm = dd->comm;
1776 r = -1;
1777 if (comm->bInterCGBondeds)
1779 if (comm->cutoff_mbody > 0)
1781 r = comm->cutoff_mbody;
1783 else
1785 /* cutoff_mbody=0 means we do not have DLB */
1786 r = comm->cellsize_min[dd->dim[0]];
1787 for (di = 1; di < dd->ndim; di++)
1789 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1791 if (comm->bBondComm)
1793 r = std::max(r, comm->cutoff_mbody);
1795 else
1797 r = std::min(r, comm->cutoff);
1802 return r;
1805 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1807 real r_mb;
1809 r_mb = dd_cutoff_multibody(dd);
1811 return std::max(dd->comm->cutoff, r_mb);
1815 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1816 ivec coord_pme)
1818 int nc, ntot;
1820 nc = dd->nc[dd->comm->cartpmedim];
1821 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1822 copy_ivec(coord, coord_pme);
1823 coord_pme[dd->comm->cartpmedim] =
1824 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
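/* Worked example (illustration only): with nc = 4 PP cells and ntot = 6 ranks
 * along cartpmedim, i.e. 2 PME ranks along that dimension, PP coordinates 0
 * and 1 map to PME coordinate 4 + (coord*2 + 1)/4 = 4, while coordinates 2
 * and 3 map to 5.
 */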
1827 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1829 int npp, npme;
1831 npp = dd->nnodes;
1832 npme = dd->comm->npmenodes;
1834 /* Here we assign a PME node to communicate with this DD node
1835 * by assuming that the major index of both is x.
1836 * We add cr->npmenodes/2 to obtain an even distribution.
1838 return (ddindex*npme + npme/2)/npp;
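/* Example of the interleaving (illustration only): with npp = 6 PP ranks and
 * npme = 2 PME ranks, (ddindex*2 + 1)/6 assigns DD indices 0..2 to PME rank 0
 * and indices 3..5 to PME rank 1.
 */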
1841 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1843 int *pme_rank;
1844 int n, i, p0, p1;
1846 snew(pme_rank, dd->comm->npmenodes);
1847 n = 0;
1848 for (i = 0; i < dd->nnodes; i++)
1850 p0 = ddindex2pmeindex(dd, i);
1851 p1 = ddindex2pmeindex(dd, i+1);
1852 if (i+1 == dd->nnodes || p1 > p0)
1854 if (debug)
1856 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1858 pme_rank[n] = i + 1 + n;
1859 n++;
1863 return pme_rank;
1866 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1868 gmx_domdec_t *dd;
1869 ivec coords;
1870 int slab;
1872 dd = cr->dd;
1874 /* Dead legacy code path, kept commented out for reference:
1875  * if (dd->comm->bCartesian) {
1876  *     gmx_ddindex2xyz(dd->nc,ddindex,coords);
1877  *     dd_coords2pmecoords(dd,coords,coords_pme);
1878  *     copy_ivec(dd->ntot,nc);
1879  *     nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
1880  *     coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1881  *     slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1882  * } else {
1883  *     slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
1884  * }
1885  */
1886 coords[XX] = x;
1887 coords[YY] = y;
1888 coords[ZZ] = z;
1889 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1891 return slab;
1894 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1896 gmx_domdec_comm_t *comm;
1897 ivec coords;
1898 int ddindex, nodeid = -1;
1900 comm = cr->dd->comm;
1902 coords[XX] = x;
1903 coords[YY] = y;
1904 coords[ZZ] = z;
1905 if (comm->bCartesianPP_PME)
1907 #if GMX_MPI
1908 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1909 #endif
1911 else
1913 ddindex = dd_index(cr->dd->nc, coords);
1914 if (comm->bCartesianPP)
1916 nodeid = comm->ddindex2simnodeid[ddindex];
1918 else
1920 if (comm->pmenodes)
1922 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1924 else
1926 nodeid = ddindex;
1931 return nodeid;
1934 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1935 const t_commrec gmx_unused *cr,
1936 int sim_nodeid)
1938 int pmenode = -1;
1940 const gmx_domdec_comm_t *comm = dd->comm;
1942 /* This assumes a uniform x domain decomposition grid cell size */
1943 if (comm->bCartesianPP_PME)
1945 #if GMX_MPI
1946 ivec coord, coord_pme;
1947 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1948 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1950 /* This is a PP node */
1951 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1952 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1954 #endif
1956 else if (comm->bCartesianPP)
1958 if (sim_nodeid < dd->nnodes)
1960 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1963 else
1965 /* This assumes DD cells with identical x coordinates
1966 * are numbered sequentially.
1968 if (dd->comm->pmenodes == nullptr)
1970 if (sim_nodeid < dd->nnodes)
1972 /* The DD index equals the nodeid */
1973 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1976 else
1978 int i = 0;
1979 while (sim_nodeid > dd->comm->pmenodes[i])
1981 i++;
1983 if (sim_nodeid < dd->comm->pmenodes[i])
1985 pmenode = dd->comm->pmenodes[i];
1990 return pmenode;
1993 void get_pme_nnodes(const gmx_domdec_t *dd,
1994 int *npmenodes_x, int *npmenodes_y)
1996 if (dd != nullptr)
1998 *npmenodes_x = dd->comm->npmenodes_x;
1999 *npmenodes_y = dd->comm->npmenodes_y;
2001 else
2003 *npmenodes_x = 1;
2004 *npmenodes_y = 1;
2008 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2009 int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2011 gmx_domdec_t *dd;
2012 int x, y, z;
2013 ivec coord, coord_pme;
2015 dd = cr->dd;
2017 snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2019 *nmy_ddnodes = 0;
2020 for (x = 0; x < dd->nc[XX]; x++)
2022 for (y = 0; y < dd->nc[YY]; y++)
2024 for (z = 0; z < dd->nc[ZZ]; z++)
2026 if (dd->comm->bCartesianPP_PME)
2028 coord[XX] = x;
2029 coord[YY] = y;
2030 coord[ZZ] = z;
2031 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2032 if (dd->ci[XX] == coord_pme[XX] &&
2033 dd->ci[YY] == coord_pme[YY] &&
2034 dd->ci[ZZ] == coord_pme[ZZ])
2036 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2039 else
2041 /* The slab corresponds to the nodeid in the PME group */
2042 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2044 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2051 /* The last PP-only node is the peer node */
2052 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2054 if (debug)
2056 fprintf(debug, "Receive coordinates from PP ranks:");
2057 for (x = 0; x < *nmy_ddnodes; x++)
2059 fprintf(debug, " %d", (*my_ddnodes)[x]);
2061 fprintf(debug, "\n");
2065 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2067 gmx_bool bReceive = TRUE;
2069 if (cr->npmenodes < dd->nnodes)
2071 gmx_domdec_comm_t *comm = dd->comm;
2072 if (comm->bCartesianPP_PME)
2074 #if GMX_MPI
2075 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2076 ivec coords;
2077 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2078 coords[comm->cartpmedim]++;
2079 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2081 int rank;
2082 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2083 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2085 /* This is not the last PP node for pmenode */
2086 bReceive = FALSE;
2089 #else
2090 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2091 #endif
2093 else
2095 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2096 if (cr->sim_nodeid+1 < cr->nnodes &&
2097 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2099 /* This is not the last PP node for pmenode */
2100 bReceive = FALSE;
2105 return bReceive;
2108 static void set_zones_ncg_home(gmx_domdec_t *dd)
2110 gmx_domdec_zones_t *zones;
2111 int i;
2113 zones = &dd->comm->zones;
2115 zones->cg_range[0] = 0;
2116 for (i = 1; i < zones->n+1; i++)
2118 zones->cg_range[i] = dd->ncg_home;
2120 /* zone_ncg1[0] should always be equal to ncg_home */
2121 dd->comm->zone_ncg1[0] = dd->ncg_home;
2124 static void rebuild_cgindex(gmx_domdec_t *dd,
2125 const int *gcgs_index, const t_state *state)
2127 int * gmx_restrict dd_cg_gl = dd->index_gl;
2128 int * gmx_restrict cgindex = dd->cgindex;
2129 int nat = 0;
2131 /* Copy back the global charge group indices from state
2132 * and rebuild the local charge group to atom index.
2134 cgindex[0] = nat;
2135 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2137 cgindex[i] = nat;
2138 int cg_gl = state->cg_gl[i];
2139 dd_cg_gl[i] = cg_gl;
2140 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2142 cgindex[state->cg_gl.size()] = nat;
2144 dd->ncg_home = state->cg_gl.size();
2145 dd->nat_home = nat;
2147 set_zones_ncg_home(dd);
2150 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2152 while (cg >= cginfo_mb->cg_end)
2154 cginfo_mb++;
2157 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
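/* Lookup sketch (editor's reading): cginfo_mb stores cginfo only for the
 * first molecule of each molecule block, so any charge group cg in a block
 * starting at cg_start with cg_mod charge groups per molecule maps back to
 * entry (cg - cg_start) % cg_mod; e.g. cg_start = 100, cg_mod = 10 and
 * cg = 137 select entry 7.
 */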
2160 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2161 t_forcerec *fr, char *bLocalCG)
2163 cginfo_mb_t *cginfo_mb;
2164 int *cginfo;
2165 int cg;
2167 if (fr != nullptr)
2169 cginfo_mb = fr->cginfo_mb;
2170 cginfo = fr->cginfo;
2172 for (cg = cg0; cg < cg1; cg++)
2174 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2178 if (bLocalCG != nullptr)
2180 for (cg = cg0; cg < cg1; cg++)
2182 bLocalCG[index_gl[cg]] = TRUE;
2187 static void make_dd_indices(gmx_domdec_t *dd,
2188 const int *gcgs_index, int cg_start)
2190 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2191 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2192 gmx_bool bCGs;
2194 if (dd->nat_tot > dd->gatindex_nalloc)
2196 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2197 srenew(dd->gatindex, dd->gatindex_nalloc);
2200 nzone = dd->comm->zones.n;
2201 zone2cg = dd->comm->zones.cg_range;
2202 zone_ncg1 = dd->comm->zone_ncg1;
2203 index_gl = dd->index_gl;
2204 gatindex = dd->gatindex;
2205 bCGs = dd->comm->bCGs;
2207 if (zone2cg[1] != dd->ncg_home)
2209 gmx_incons("dd->ncg_zone is not up to date");
2212 /* Make the local to global and global to local atom index */
2213 a = dd->cgindex[cg_start];
2214 for (zone = 0; zone < nzone; zone++)
2216 if (zone == 0)
2218 cg0 = cg_start;
2220 else
2222 cg0 = zone2cg[zone];
2224 cg1 = zone2cg[zone+1];
2225 cg1_p1 = cg0 + zone_ncg1[zone];
2227 for (cg = cg0; cg < cg1; cg++)
2229 zone1 = zone;
2230 if (cg >= cg1_p1)
2232 /* Signal that this cg is from more than one pulse away */
2233 zone1 += nzone;
2235 cg_gl = index_gl[cg];
2236 if (bCGs)
2238 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2240 gatindex[a] = a_gl;
2241 ga2la_set(dd->ga2la, a_gl, a, zone1);
2242 a++;
2245 else
2247 gatindex[a] = cg_gl;
2248 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2249 a++;
2255 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2256 const char *where)
2258 int i, ngl, nerr;
2260 nerr = 0;
2261 if (bLocalCG == nullptr)
2263 return nerr;
2265 for (i = 0; i < dd->ncg_tot; i++)
2267 if (!bLocalCG[dd->index_gl[i]])
2269 fprintf(stderr,
2270 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2271 nerr++;
2274 ngl = 0;
2275 for (i = 0; i < ncg_sys; i++)
2277 if (bLocalCG[i])
2279 ngl++;
2282 if (ngl != dd->ncg_tot)
2284 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2285 nerr++;
2288 return nerr;
2291 static void check_index_consistency(gmx_domdec_t *dd,
2292 int natoms_sys, int ncg_sys,
2293 const char *where)
2295 int nerr, ngl, i, a, cell;
2296 int *have;
2298 nerr = 0;
2300 if (dd->comm->DD_debug > 1)
2302 snew(have, natoms_sys);
2303 for (a = 0; a < dd->nat_tot; a++)
2305 if (have[dd->gatindex[a]] > 0)
2307 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2309 else
2311 have[dd->gatindex[a]] = a + 1;
2314 sfree(have);
2317 snew(have, dd->nat_tot);
2319 ngl = 0;
2320 for (i = 0; i < natoms_sys; i++)
2322 if (ga2la_get(dd->ga2la, i, &a, &cell))
2324 if (a >= dd->nat_tot)
2326 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2327 nerr++;
2329 else
2331 have[a] = 1;
2332 if (dd->gatindex[a] != i)
2334 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2335 nerr++;
2338 ngl++;
2341 if (ngl != dd->nat_tot)
2343 fprintf(stderr,
2344 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2345 dd->rank, where, ngl, dd->nat_tot);
2347 for (a = 0; a < dd->nat_tot; a++)
2349 if (have[a] == 0)
2351 fprintf(stderr,
2352 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2353 dd->rank, where, a+1, dd->gatindex[a]+1);
2356 sfree(have);
2358 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2360 if (nerr > 0)
2362 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2363 dd->rank, where, nerr);
2367 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2369 int i;
2370 char *bLocalCG;
2372 if (a_start == 0)
2374 /* Clear the whole list without searching */
2375 ga2la_clear(dd->ga2la);
2377 else
2379 for (i = a_start; i < dd->nat_tot; i++)
2381 ga2la_del(dd->ga2la, dd->gatindex[i]);
2385 bLocalCG = dd->comm->bLocalCG;
2386 if (bLocalCG)
2388 for (i = cg_start; i < dd->ncg_tot; i++)
2390 bLocalCG[dd->index_gl[i]] = FALSE;
2394 dd_clear_local_vsite_indices(dd);
2396 if (dd->constraints)
2398 dd_clear_local_constraint_indices(dd);
2402 /* This function is used when moving the domain boundaries during DLB,
2403 * to obtain the minimum cell size. It checks the initially set limit
2404 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2405 * and, possibly, a longer cut-off limit set for PME load balancing.
2407 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2409 real cellsize_min;
2411 cellsize_min = comm->cellsize_min[dim];
2413 if (!comm->bVacDLBNoLimit)
2415 /* The cut-off might have changed, e.g. by PME load balancing,
2416 * from the value used to set comm->cellsize_min, so check it.
2418 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2420 if (comm->bPMELoadBalDLBLimits)
2422 /* Check for the cut-off limit set by the PME load balancing */
2423 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2427 return cellsize_min;
2430 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2431 int dim_ind)
2433 real grid_jump_limit;
2435 /* The distance between the boundaries of cells at distance
2436 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2437 * and by the fact that cells should not be shifted by more than
2438 * half their size, such that cg's only shift by one cell
2439 * at redecomposition.
2441 grid_jump_limit = comm->cellsize_limit;
2442 if (!comm->bVacDLBNoLimit)
2444 if (comm->bPMELoadBalDLBLimits)
2446 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2448 grid_jump_limit = std::max(grid_jump_limit,
2449 cutoff/comm->cd[dim_ind].np);
2452 return grid_jump_limit;
2455 static gmx_bool check_grid_jump(gmx_int64_t step,
2456 gmx_domdec_t *dd,
2457 real cutoff,
2458 gmx_ddbox_t *ddbox,
2459 gmx_bool bFatal)
2461 gmx_domdec_comm_t *comm;
2462 int d, dim;
2463 real limit, bfac;
2464 gmx_bool bInvalid;
2466 bInvalid = FALSE;
2468 comm = dd->comm;
2470 for (d = 1; d < dd->ndim; d++)
2472 dim = dd->dim[d];
2473 limit = grid_jump_limit(comm, cutoff, d);
2474 bfac = ddbox->box_size[dim];
2475 if (ddbox->tric_dir[dim])
2477 bfac *= ddbox->skew_fac[dim];
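/* Illustrative note (an assumption based on the code below, not from the
 * original comments): cell_f_max0[d] and cell_f_min1[d] appear to hold the
 * extreme boundary fractions of the staggered neighbor rows, as communicated
 * in dd_move_cellx, so the check below requires that our own boundaries
 * overlap those extremes by at least 'limit' along the box vector.
 */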
2479 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2480 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2482 bInvalid = TRUE;
2484 if (bFatal)
2486 char buf[22];
2488 /* This error should never be triggered under normal
2489 * circumstances, but you never know ...
2491 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2492 gmx_step_str(step, buf),
2493 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2498 return bInvalid;
2501 static int dd_load_count(gmx_domdec_comm_t *comm)
2503 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2506 static float dd_force_load(gmx_domdec_comm_t *comm)
2508 float load;
2510 if (comm->eFlop)
2512 load = comm->flop;
2513 if (comm->eFlop > 1)
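/* Note (an assumption, not stated in the original comments): eFlop values
 * above 1 appear to add up to +/-5% of pseudo-random noise to the flop-based
 * load, scaled by (eFlop - 1), which can be useful for exercising the dynamic
 * load balancing without a genuinely imbalanced system.
 */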
2515 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2518 else
2520 load = comm->cycl[ddCyclF];
2521 if (comm->cycl_n[ddCyclF] > 1)
2523 /* Subtract the maximum of the last n cycle counts
2524 * to get rid of possible high counts due to other sources,
2525 * for instance system activity, that would otherwise
2526 * affect the dynamic load balancing.
2528 load -= comm->cycl_max[ddCyclF];
2531 #if GMX_MPI
2532 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2534 float gpu_wait, gpu_wait_sum;
2536 gpu_wait = comm->cycl[ddCyclWaitGPU];
2537 if (comm->cycl_n[ddCyclF] > 1)
2539 /* We should remove the WaitGPU time of the same MD step
2540 * as the one with the maximum F time, since the F time
2541 * and the wait time are not independent.
2542 * Furthermore, the step for the max F time should be chosen
2543 * the same on all ranks that share the same GPU.
2544 * But to keep the code simple, we remove the average instead.
2545 * The main reason for artificially long times at some steps
2546 * is spurious CPU activity or MPI time, so we don't expect
2547 * that changes in the GPU wait time matter a lot here.
2549 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2551 /* Sum the wait times over the ranks that share the same GPU */
2552 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2553 comm->mpi_comm_gpu_shared);
2554 /* Replace the wait time by the average over the ranks */
2555 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2557 #endif
2560 return load;
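/* set_slb_pme_dim_f below fills *dim_f with the nc[dim]+1 cumulative
 * fractional cell boundaries along dimension dim: the user-provided static
 * load-balancing fractions (slb_frac) when present, otherwise a uniform
 * spacing i/nc[dim].
 */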
2563 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2565 gmx_domdec_comm_t *comm;
2566 int i;
2568 comm = dd->comm;
2570 snew(*dim_f, dd->nc[dim]+1);
2571 (*dim_f)[0] = 0;
2572 for (i = 1; i < dd->nc[dim]; i++)
2574 if (comm->slb_frac[dim])
2576 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2578 else
2580 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2583 (*dim_f)[dd->nc[dim]] = 1;
2586 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2588 int pmeindex, slab, nso, i;
2589 ivec xyz;
2591 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2593 ddpme->dim = YY;
2595 else
2597 ddpme->dim = dimind;
2599 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2601 ddpme->nslab = (ddpme->dim == 0 ?
2602 dd->comm->npmenodes_x :
2603 dd->comm->npmenodes_y);
2605 if (ddpme->nslab <= 1)
2607 return;
2610 nso = dd->comm->npmenodes/ddpme->nslab;
2611 /* Determine for each PME slab the PP location range for dimension dim */
2612 snew(ddpme->pp_min, ddpme->nslab);
2613 snew(ddpme->pp_max, ddpme->nslab);
2614 for (slab = 0; slab < ddpme->nslab; slab++)
2616 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2617 ddpme->pp_max[slab] = 0;
2619 for (i = 0; i < dd->nnodes; i++)
2621 ddindex2xyz(dd->nc, i, xyz);
2622 /* For y only use our y/z slab.
2623 * This assumes that the PME x grid size matches the DD grid size.
2625 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2627 pmeindex = ddindex2pmeindex(dd, i);
2628 if (dimind == 0)
2630 slab = pmeindex/nso;
2632 else
2634 slab = pmeindex % ddpme->nslab;
2636 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2637 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2641 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2644 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2646 if (dd->comm->ddpme[0].dim == XX)
2648 return dd->comm->ddpme[0].maxshift;
2650 else
2652 return 0;
2656 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2658 if (dd->comm->ddpme[0].dim == YY)
2660 return dd->comm->ddpme[0].maxshift;
2662 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2664 return dd->comm->ddpme[1].maxshift;
2666 else
2668 return 0;
2672 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2673 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2674 const real *cell_f)
2676 gmx_domdec_comm_t *comm;
2677 int nc, ns, s;
2678 int *xmin, *xmax;
2679 real range, pme_boundary;
2680 int sh;
2682 comm = dd->comm;
2683 nc = dd->nc[ddpme->dim];
2684 ns = ddpme->nslab;
2686 if (!ddpme->dim_match)
2688 /* PP decomposition is not along dim: the worst situation */
2689 sh = ns/2;
2691 else if (ns <= 3 || (bUniform && ns == nc))
2693 /* The optimal situation */
2694 sh = 1;
2696 else
2698 /* We need to check, for each PME node, which nodes it
2699 * could possibly need to communicate with.
2701 xmin = ddpme->pp_min;
2702 xmax = ddpme->pp_max;
2703 /* Allow for atoms to be maximally 2/3 times the cut-off
2704 * out of their DD cell. This is a reasonable balance between
2705 * performance and support for most charge-group/cut-off
2706 * combinations.
2708 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2709 /* Avoid extra communication when we are exactly at a boundary */
2710 range *= 0.999;
2712 sh = 1;
2713 for (s = 0; s < ns; s++)
2715 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2716 pme_boundary = (real)s/ns;
2717 while (sh+1 < ns &&
2718 ((s-(sh+1) >= 0 &&
2719 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2720 (s-(sh+1) < 0 &&
2721 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2723 sh++;
2725 pme_boundary = (real)(s+1)/ns;
2726 while (sh+1 < ns &&
2727 ((s+(sh+1) < ns &&
2728 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2729 (s+(sh+1) >= ns &&
2730 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2732 sh++;
2737 ddpme->maxshift = sh;
2739 if (debug)
2741 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2742 ddpme->dim, ddpme->maxshift);
2746 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2748 int d, dim;
2750 for (d = 0; d < dd->ndim; d++)
2752 dim = dd->dim[d];
2753 if (dim < ddbox->nboundeddim &&
2754 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2755 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2757 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2758 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2759 dd->nc[dim], dd->comm->cellsize_limit);
2764 enum {
2765 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2768 /* Set the domain boundaries. Use for static (or no) load balancing,
2769 * and also for the starting state for dynamic load balancing.
2770 * setmode determines if and where the boundaries are stored; use the enum above.
2771 * Returns the number of communication pulses in npulse.
2773 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2774 int setmode, ivec npulse)
2776 gmx_domdec_comm_t *comm;
2777 int d, j;
2778 rvec cellsize_min;
2779 real *cell_x, cell_dx, cellsize;
2781 comm = dd->comm;
2783 for (d = 0; d < DIM; d++)
2785 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2786 npulse[d] = 1;
2787 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2789 /* Uniform grid */
2790 cell_dx = ddbox->box_size[d]/dd->nc[d];
2791 switch (setmode)
2793 case setcellsizeslbMASTER:
2794 for (j = 0; j < dd->nc[d]+1; j++)
2796 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2798 break;
2799 case setcellsizeslbLOCAL:
2800 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2801 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2802 break;
2803 default:
2804 break;
2806 cellsize = cell_dx*ddbox->skew_fac[d];
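/* npulse[d] becomes the smallest integer with npulse[d]*cellsize >= cutoff,
 * i.e. the number of neighboring cells needed to cover the cut-off.
 */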
2807 while (cellsize*npulse[d] < comm->cutoff)
2809 npulse[d]++;
2811 cellsize_min[d] = cellsize;
2813 else
2815 /* Statically load balanced grid */
2816 /* Even when we are not doing a master distribution, we determine
2817 * all cell borders in a loop to obtain values identical to
2818 * the master-distribution case and to determine npulse.
2820 if (setmode == setcellsizeslbMASTER)
2822 cell_x = dd->ma->cell_x[d];
2824 else
2826 snew(cell_x, dd->nc[d]+1);
2828 cell_x[0] = ddbox->box0[d];
2829 for (j = 0; j < dd->nc[d]; j++)
2831 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2832 cell_x[j+1] = cell_x[j] + cell_dx;
2833 cellsize = cell_dx*ddbox->skew_fac[d];
2834 while (cellsize*npulse[d] < comm->cutoff &&
2835 npulse[d] < dd->nc[d]-1)
2837 npulse[d]++;
2839 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2841 if (setmode == setcellsizeslbLOCAL)
2843 comm->cell_x0[d] = cell_x[dd->ci[d]];
2844 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2846 if (setmode != setcellsizeslbMASTER)
2848 sfree(cell_x);
2851 /* The following limitation avoids a cell receiving some of its own
2852 * home charge groups back over the periodic boundary.
2853 * Duplicate charge groups cause trouble with the global indices.
2855 if (d < ddbox->npbcdim &&
2856 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2858 char error_string[STRLEN];
2860 sprintf(error_string,
2861 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2862 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2863 comm->cutoff,
2864 dd->nc[d], dd->nc[d],
2865 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2867 if (setmode == setcellsizeslbLOCAL)
2869 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2870 error_string);
2872 else
2874 gmx_fatal(FARGS, error_string);
2879 if (!isDlbOn(comm))
2881 copy_rvec(cellsize_min, comm->cellsize_min);
2884 for (d = 0; d < comm->npmedecompdim; d++)
2886 set_pme_maxshift(dd, &comm->ddpme[d],
2887 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2888 comm->ddpme[d].slb_dim_f);
2893 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2894 int d, int dim, domdec_root_t *root,
2895 const gmx_ddbox_t *ddbox,
2896 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2898 gmx_domdec_comm_t *comm;
2899 int ncd, i, j, nmin, nmin_old;
2900 gmx_bool bLimLo, bLimHi;
2901 real *cell_size;
2902 real fac, halfway, cellsize_limit_f_i, region_size;
2903 gmx_bool bPBC, bLastHi = FALSE;
2904 int nrange[] = {range[0], range[1]};
2906 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2908 comm = dd->comm;
2910 ncd = dd->nc[dim];
2912 bPBC = (dim < ddbox->npbcdim);
2914 cell_size = root->buf_ncd;
2916 if (debug)
2918 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2921 /* First we need to check that the scaling does not make cells
2922 * smaller than the smallest allowed size.
2923 * We need to do this iteratively, since if a cell is too small,
2924 * it needs to be enlarged, which makes all the other cells smaller,
2925 * which could in turn make another cell smaller than allowed.
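 * Illustrative example (numbers chosen here, not from the original source):
 * with four cells of relative size 0.1, 0.3, 0.3, 0.3 over a region of 1.0
 * and cellsize_limit_f = 0.2, the first pass clamps cell 0 to 0.2 and
 * rescales the remaining cells by (1.0 - 0.2)/0.9, giving about 0.267 each;
 * no new cell drops below the limit, so the loop terminates.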
2927 for (i = range[0]; i < range[1]; i++)
2929 root->bCellMin[i] = FALSE;
2931 nmin = 0;
2934 nmin_old = nmin;
2935 /* We need the total for normalization */
2936 fac = 0;
2937 for (i = range[0]; i < range[1]; i++)
2939 if (root->bCellMin[i] == FALSE)
2941 fac += cell_size[i];
2944 fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2945 /* Determine the cell boundaries */
2946 for (i = range[0]; i < range[1]; i++)
2948 if (root->bCellMin[i] == FALSE)
2950 cell_size[i] *= fac;
2951 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2953 cellsize_limit_f_i = 0;
2955 else
2957 cellsize_limit_f_i = cellsize_limit_f;
2959 if (cell_size[i] < cellsize_limit_f_i)
2961 root->bCellMin[i] = TRUE;
2962 cell_size[i] = cellsize_limit_f_i;
2963 nmin++;
2966 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2969 while (nmin > nmin_old);
2971 i = range[1]-1;
2972 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2973 /* For this check we should not use DD_CELL_MARGIN,
2974 * but a slightly smaller factor,
2975 * since rounding could get us below the limit.
2977 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2979 char buf[22];
2980 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2981 gmx_step_str(step, buf),
2982 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2983 ncd, comm->cellsize_min[dim]);
2986 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2988 if (!bUniform)
2990 /* Check that the boundary has not been displaced past the old midpoint
2991 * of either of the cells it bounds, as this could cause problems,
2992 * especially when the differences between cell sizes are large.
2993 * If changes are applied, they will not make cells smaller
2994 * than the cut-off, as we check all the boundaries which
2995 * might be affected by a change and if the old state was ok,
2996 * the cells will at most be shrunk back to their old size.
2998 for (i = range[0]+1; i < range[1]; i++)
3000 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3001 if (root->cell_f[i] < halfway)
3003 root->cell_f[i] = halfway;
3004 /* Check if the change also causes shifts of the next boundaries */
3005 for (j = i+1; j < range[1]; j++)
3007 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3009 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3013 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3014 if (root->cell_f[i] > halfway)
3016 root->cell_f[i] = halfway;
3017 /* Check if the change also causes shifts of the next boundaries */
3018 for (j = i-1; j >= range[0]+1; j--)
3020 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3022 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3029 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3030 /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3031 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta); oldb and nexta can be the boundaries.
3032 * For a and b, nrange is used. */
3033 if (d > 0)
3035 /* Take care of the staggering of the cell boundaries */
3036 if (bUniform)
3038 for (i = range[0]; i < range[1]; i++)
3040 root->cell_f_max0[i] = root->cell_f[i];
3041 root->cell_f_min1[i] = root->cell_f[i+1];
3044 else
3046 for (i = range[0]+1; i < range[1]; i++)
3048 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3049 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3050 if (bLimLo && bLimHi)
3052 /* Both limits violated, try the best we can */
3053 /* For this case we split the original range (range) in two parts and deal with the other limitations in the next iteration. */
3054 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3055 nrange[0] = range[0];
3056 nrange[1] = i;
3057 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3059 nrange[0] = i;
3060 nrange[1] = range[1];
3061 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3063 return;
3065 else if (bLimLo)
3067 /* root->cell_f[i] = root->bound_min[i]; */
3068 nrange[1] = i; /* only store the violation location; there could be a following LimLo violation with a higher index */
3069 bLastHi = FALSE;
3071 else if (bLimHi && !bLastHi)
3073 bLastHi = TRUE;
3074 if (nrange[1] < range[1]) /* found a LimLo before */
3076 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3077 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3078 nrange[0] = nrange[1];
3080 root->cell_f[i] = root->bound_max[i];
3081 nrange[1] = i;
3082 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3083 nrange[0] = i;
3084 nrange[1] = range[1];
3087 if (nrange[1] < range[1]) /* the last violation found was a LimLo */
3089 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3090 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3091 nrange[0] = nrange[1];
3092 nrange[1] = range[1];
3093 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3095 else if (nrange[0] > range[0]) /* found at least one LimHi */
3097 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3104 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3105 int d, int dim, domdec_root_t *root,
3106 const gmx_ddbox_t *ddbox,
3107 gmx_bool bDynamicBox,
3108 gmx_bool bUniform, gmx_int64_t step)
3110 gmx_domdec_comm_t *comm;
3111 int ncd, d1, i, pos;
3112 real *cell_size;
3113 real load_aver, load_i, imbalance, change, change_max, sc;
3114 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3115 real change_limit;
3116 real relax = 0.5;
3117 gmx_bool bPBC;
3118 int range[] = { 0, 0 };
3120 comm = dd->comm;
3122 /* Convert the maximum change from the input percentage to a fraction */
3123 change_limit = comm->dlb_scale_lim*0.01;
3125 ncd = dd->nc[dim];
3127 bPBC = (dim < ddbox->npbcdim);
3129 cell_size = root->buf_ncd;
3131 /* Store the original boundaries */
3132 for (i = 0; i < ncd+1; i++)
3134 root->old_cell_f[i] = root->cell_f[i];
3136 if (bUniform)
3138 for (i = 0; i < ncd; i++)
3140 cell_size[i] = 1.0/ncd;
3143 else if (dd_load_count(comm) > 0)
3145 load_aver = comm->load[d].sum_m/ncd;
3146 change_max = 0;
3147 for (i = 0; i < ncd; i++)
3149 /* Determine the relative imbalance of cell i */
3150 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3151 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3152 /* Determine the change of the cell size using underrelaxation */
3153 change = -relax*imbalance;
3154 change_max = std::max(change_max, std::max(change, -change));
3156 /* Limit the amount of scaling.
3157 * We need to use the same rescaling for all cells in one row,
3158 * otherwise the load balancing might not converge.
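 * Illustrative example (numbers chosen here): with relax = 0.5, a cell whose
 * load is 20% above the row average gets change = -0.1, i.e. it is shrunk by
 * 10%; if the largest relative change in the row exceeds the allowed limit
 * (comm->dlb_scale_lim percent), all changes are scaled down by the same factor.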
3160 sc = relax;
3161 if (change_max > change_limit)
3163 sc *= change_limit/change_max;
3165 for (i = 0; i < ncd; i++)
3167 /* Determine the relative imbalance of cell i */
3168 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3169 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3170 /* Determine the change of the cell size using underrelaxation */
3171 change = -sc*imbalance;
3172 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3176 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3177 cellsize_limit_f *= DD_CELL_MARGIN;
3178 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3179 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3180 if (ddbox->tric_dir[dim])
3182 cellsize_limit_f /= ddbox->skew_fac[dim];
3183 dist_min_f /= ddbox->skew_fac[dim];
3185 if (bDynamicBox && d > 0)
3187 dist_min_f *= DD_PRES_SCALE_MARGIN;
3189 if (d > 0 && !bUniform)
3191 /* Make sure that the grid is not shifted too much */
3192 for (i = 1; i < ncd; i++)
3194 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3196 gmx_incons("Inconsistent DD boundary staggering limits!");
3198 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3199 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3200 if (space > 0)
3202 root->bound_min[i] += 0.5*space;
3204 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3205 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3206 if (space < 0)
3208 root->bound_max[i] += 0.5*space;
3210 if (debug)
3212 fprintf(debug,
3213 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3214 d, i,
3215 root->cell_f_max0[i-1] + dist_min_f,
3216 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3217 root->cell_f_min1[i] - dist_min_f);
3221 range[1] = ncd;
3222 root->cell_f[0] = 0;
3223 root->cell_f[ncd] = 1;
3224 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3227 /* After the checks above, the cells should obey the cut-off
3228 * restrictions, but it does not hurt to check.
3230 for (i = 0; i < ncd; i++)
3232 if (debug)
3234 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3235 dim, i, root->cell_f[i], root->cell_f[i+1]);
3238 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3239 root->cell_f[i+1] - root->cell_f[i] <
3240 cellsize_limit_f/DD_CELL_MARGIN)
3242 char buf[22];
3243 fprintf(stderr,
3244 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3245 gmx_step_str(step, buf), dim2char(dim), i,
3246 (root->cell_f[i+1] - root->cell_f[i])
3247 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3251 pos = ncd + 1;
3252 /* Store the cell boundaries of the lower dimensions at the end */
3253 for (d1 = 0; d1 < d; d1++)
3255 root->cell_f[pos++] = comm->cell_f0[d1];
3256 root->cell_f[pos++] = comm->cell_f1[d1];
3259 if (d < comm->npmedecompdim)
3261 /* The master determines the maximum shift for
3262 * the coordinate communication between separate PME nodes.
3264 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3266 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3267 if (d >= 1)
3269 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3273 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3274 const gmx_ddbox_t *ddbox,
3275 int dimind)
3277 gmx_domdec_comm_t *comm;
3278 int dim;
3280 comm = dd->comm;
3282 /* Set the cell dimensions */
3283 dim = dd->dim[dimind];
3284 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3285 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3286 if (dim >= ddbox->nboundeddim)
3288 comm->cell_x0[dim] += ddbox->box0[dim];
3289 comm->cell_x1[dim] += ddbox->box0[dim];
3293 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3294 int d, int dim, real *cell_f_row,
3295 const gmx_ddbox_t *ddbox)
3297 gmx_domdec_comm_t *comm;
3298 int d1, pos;
3300 comm = dd->comm;
3302 #if GMX_MPI
3303 /* Each node would only need to know two fractions,
3304 * but it is probably cheaper to broadcast the whole array.
3306 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3307 0, comm->mpi_comm_load[d]);
3308 #endif
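/* Layout of cell_f_row, as filled in set_dd_cell_sizes_dlb_root:
 * nc[dim]+1 cell boundary fractions, then a (cell_f0, cell_f1) pair for each
 * lower dimension, then ddpme[0].maxshift and, for d >= 1, ddpme[1].maxshift.
 */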
3309 /* Copy the fractions for this dimension from the buffer */
3310 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3311 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3312 /* The whole array was communicated, so set the buffer position */
3313 pos = dd->nc[dim] + 1;
3314 for (d1 = 0; d1 <= d; d1++)
3316 if (d1 < d)
3318 /* Copy the cell fractions of the lower dimensions */
3319 comm->cell_f0[d1] = cell_f_row[pos++];
3320 comm->cell_f1[d1] = cell_f_row[pos++];
3322 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3324 /* Convert the communicated shift from float to int */
3325 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3326 if (d >= 1)
3328 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3332 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3333 const gmx_ddbox_t *ddbox,
3334 gmx_bool bDynamicBox,
3335 gmx_bool bUniform, gmx_int64_t step)
3337 gmx_domdec_comm_t *comm;
3338 int d, dim, d1;
3339 gmx_bool bRowMember, bRowRoot;
3340 real *cell_f_row;
3342 comm = dd->comm;
3344 for (d = 0; d < dd->ndim; d++)
3346 dim = dd->dim[d];
3347 bRowMember = TRUE;
3348 bRowRoot = TRUE;
3349 for (d1 = d; d1 < dd->ndim; d1++)
3351 if (dd->ci[dd->dim[d1]] > 0)
3353 if (d1 != d)
3355 bRowMember = FALSE;
3357 bRowRoot = FALSE;
3360 if (bRowMember)
3362 if (bRowRoot)
3364 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3365 ddbox, bDynamicBox, bUniform, step);
3366 cell_f_row = comm->root[d]->cell_f;
3368 else
3370 cell_f_row = comm->cell_f_row;
3372 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3377 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3378 const gmx_ddbox_t *ddbox)
3380 int d;
3382 /* This function assumes the box is static and should therefore
3383 * not be called when the box has changed since the last
3384 * call to dd_partition_system.
3386 for (d = 0; d < dd->ndim; d++)
3388 relative_to_absolute_cell_bounds(dd, ddbox, d);
3394 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3395 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3396 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3397 gmx_wallcycle_t wcycle)
3399 gmx_domdec_comm_t *comm;
3400 int dim;
3402 comm = dd->comm;
3404 if (bDoDLB)
3406 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3407 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3408 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3410 else if (bDynamicBox)
3412 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3415 /* Set the dimensions for which no DD is used */
3416 for (dim = 0; dim < DIM; dim++)
3418 if (dd->nc[dim] == 1)
3420 comm->cell_x0[dim] = 0;
3421 comm->cell_x1[dim] = ddbox->box_size[dim];
3422 if (dim >= ddbox->nboundeddim)
3424 comm->cell_x0[dim] += ddbox->box0[dim];
3425 comm->cell_x1[dim] += ddbox->box0[dim];
3431 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3433 int d, np, i;
3434 gmx_domdec_comm_dim_t *cd;
3436 for (d = 0; d < dd->ndim; d++)
3438 cd = &dd->comm->cd[d];
3439 np = npulse[dd->dim[d]];
3440 if (np > cd->np_nalloc)
3442 if (debug)
3444 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3445 dim2char(dd->dim[d]), np);
3447 if (DDMASTER(dd) && cd->np_nalloc > 0)
3449 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3451 srenew(cd->ind, np);
3452 for (i = cd->np_nalloc; i < np; i++)
3454 cd->ind[i].index = nullptr;
3455 cd->ind[i].nalloc = 0;
3457 cd->np_nalloc = np;
3459 cd->np = np;
3464 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3465 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3466 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3467 gmx_wallcycle_t wcycle)
3469 gmx_domdec_comm_t *comm;
3470 int d;
3471 ivec npulse;
3473 comm = dd->comm;
3475 /* Copy the old cell boundaries for the cg displacement check */
3476 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3477 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3479 if (isDlbOn(comm))
3481 if (DDMASTER(dd))
3483 check_box_size(dd, ddbox);
3485 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3487 else
3489 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3490 realloc_comm_ind(dd, npulse);
3493 if (debug)
3495 for (d = 0; d < DIM; d++)
3497 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3498 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3503 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3504 gmx_ddbox_t *ddbox,
3505 rvec cell_ns_x0, rvec cell_ns_x1,
3506 gmx_int64_t step)
3508 gmx_domdec_comm_t *comm;
3509 int dim_ind, dim;
3511 comm = dd->comm;
3513 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3515 dim = dd->dim[dim_ind];
3517 /* Without PBC we don't have restrictions on the outer cells */
3518 if (!(dim >= ddbox->npbcdim &&
3519 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3520 isDlbOn(comm) &&
3521 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3522 comm->cellsize_min[dim])
3524 char buf[22];
3525 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3526 gmx_step_str(step, buf), dim2char(dim),
3527 comm->cell_x1[dim] - comm->cell_x0[dim],
3528 ddbox->skew_fac[dim],
3529 dd->comm->cellsize_min[dim],
3530 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3534 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3536 /* Communicate the boundaries and update cell_ns_x0/1 */
3537 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3538 if (isDlbOn(dd->comm) && dd->ndim > 1)
3540 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
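/* make_tric_corr_matrix below builds the triclinic correction matrix tcm that
 * removes the off-diagonal box contributions from a position (used further
 * down as pos_d += x[j]*tcm[j][d]), so coordinates can be compared against
 * the cell boundaries of each dimension separately.
 */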
3545 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3547 if (YY < npbcdim)
3549 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3551 else
3553 tcm[YY][XX] = 0;
3555 if (ZZ < npbcdim)
3557 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3558 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3560 else
3562 tcm[ZZ][XX] = 0;
3563 tcm[ZZ][YY] = 0;
3567 static void check_screw_box(matrix box)
3569 /* Mathematical limitation */
3570 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3572 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3575 /* Limitation due to the asymmetry of the eighth shell method */
3576 if (box[ZZ][YY] != 0)
3578 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3582 static void distribute_cg(FILE *fplog,
3583 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3584 gmx_domdec_t *dd)
3586 gmx_domdec_master_t *ma;
3587 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3588 int i, icg, j, k, k0, k1, d;
3589 matrix tcm;
3590 rvec cg_cm;
3591 ivec ind;
3592 real nrcg, inv_ncg, pos_d;
3593 int *cgindex;
3594 gmx_bool bScrew;
3596 ma = dd->ma;
3598 snew(tmp_nalloc, dd->nnodes);
3599 snew(tmp_ind, dd->nnodes);
3600 for (i = 0; i < dd->nnodes; i++)
3602 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3603 snew(tmp_ind[i], tmp_nalloc[i]);
3606 /* Clear the count */
3607 for (i = 0; i < dd->nnodes; i++)
3609 ma->ncg[i] = 0;
3610 ma->nat[i] = 0;
3613 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3615 cgindex = cgs->index;
3617 /* Compute the center of geometry for all charge groups */
3618 for (icg = 0; icg < cgs->nr; icg++)
3620 k0 = cgindex[icg];
3621 k1 = cgindex[icg+1];
3622 nrcg = k1 - k0;
3623 if (nrcg == 1)
3625 copy_rvec(pos[k0], cg_cm);
3627 else
3629 inv_ncg = 1.0/nrcg;
3631 clear_rvec(cg_cm);
3632 for (k = k0; (k < k1); k++)
3634 rvec_inc(cg_cm, pos[k]);
3636 for (d = 0; (d < DIM); d++)
3638 cg_cm[d] *= inv_ncg;
3641 /* Put the charge group in the box and determine the cell index */
3642 for (d = DIM-1; d >= 0; d--)
3644 pos_d = cg_cm[d];
3645 if (d < dd->npbcdim)
3647 bScrew = (dd->bScrewPBC && d == XX);
3648 if (tric_dir[d] && dd->nc[d] > 1)
3650 /* Use triclinic coordinates for this dimension */
3651 for (j = d+1; j < DIM; j++)
3653 pos_d += cg_cm[j]*tcm[j][d];
3656 while (pos_d >= box[d][d])
3658 pos_d -= box[d][d];
3659 rvec_dec(cg_cm, box[d]);
3660 if (bScrew)
3662 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3663 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3665 for (k = k0; (k < k1); k++)
3667 rvec_dec(pos[k], box[d]);
3668 if (bScrew)
3670 pos[k][YY] = box[YY][YY] - pos[k][YY];
3671 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3675 while (pos_d < 0)
3677 pos_d += box[d][d];
3678 rvec_inc(cg_cm, box[d]);
3679 if (bScrew)
3681 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3682 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3684 for (k = k0; (k < k1); k++)
3686 rvec_inc(pos[k], box[d]);
3687 if (bScrew)
3689 pos[k][YY] = box[YY][YY] - pos[k][YY];
3690 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3695 /* This could be done more efficiently */
3696 ind[d] = 0;
3697 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3699 ind[d]++;
3702 i = dd_index(dd->nc, ind);
3703 if (ma->ncg[i] == tmp_nalloc[i])
3705 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3706 srenew(tmp_ind[i], tmp_nalloc[i]);
3708 tmp_ind[i][ma->ncg[i]] = icg;
3709 ma->ncg[i]++;
3710 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3713 k1 = 0;
3714 for (i = 0; i < dd->nnodes; i++)
3716 ma->index[i] = k1;
3717 for (k = 0; k < ma->ncg[i]; k++)
3719 ma->cg[k1++] = tmp_ind[i][k];
3722 ma->index[dd->nnodes] = k1;
3724 for (i = 0; i < dd->nnodes; i++)
3726 sfree(tmp_ind[i]);
3728 sfree(tmp_ind);
3729 sfree(tmp_nalloc);
3731 if (fplog)
3733 // Use double for the sums to avoid natoms^2 overflowing
3734 // (65537^2 > 2^32)
3735 int nat_sum, nat_min, nat_max;
3736 double nat2_sum;
3738 nat_sum = 0;
3739 nat2_sum = 0;
3740 nat_min = ma->nat[0];
3741 nat_max = ma->nat[0];
3742 for (i = 0; i < dd->nnodes; i++)
3744 nat_sum += ma->nat[i];
3745 // cast to double to avoid integer overflows when squaring
3746 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3747 nat_min = std::min(nat_min, ma->nat[i]);
3748 nat_max = std::max(nat_max, ma->nat[i]);
3750 nat_sum /= dd->nnodes;
3751 nat2_sum /= dd->nnodes;
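/* The value printed below is the population standard deviation over ranks,
 * sqrt(<nat^2> - <nat>^2), with nat_sum and nat2_sum now holding the averages.
 */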
3753 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3754 dd->nnodes,
3755 nat_sum,
3756 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3757 nat_min, nat_max);
3761 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3762 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3763 rvec pos[])
3765 gmx_domdec_master_t *ma = nullptr;
3766 ivec npulse;
3767 int i, cg_gl;
3768 int *ibuf, buf2[2] = { 0, 0 };
3769 gmx_bool bMaster = DDMASTER(dd);
3771 if (bMaster)
3773 ma = dd->ma;
3775 if (dd->bScrewPBC)
3777 check_screw_box(box);
3780 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3782 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3783 for (i = 0; i < dd->nnodes; i++)
3785 ma->ibuf[2*i] = ma->ncg[i];
3786 ma->ibuf[2*i+1] = ma->nat[i];
3788 ibuf = ma->ibuf;
3790 else
3792 ibuf = nullptr;
3794 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3796 dd->ncg_home = buf2[0];
3797 dd->nat_home = buf2[1];
3798 dd->ncg_tot = dd->ncg_home;
3799 dd->nat_tot = dd->nat_home;
3800 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3802 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3803 srenew(dd->index_gl, dd->cg_nalloc);
3804 srenew(dd->cgindex, dd->cg_nalloc+1);
3806 if (bMaster)
3808 for (i = 0; i < dd->nnodes; i++)
3810 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3811 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3815 dd_scatterv(dd,
3816 bMaster ? ma->ibuf : nullptr,
3817 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3818 bMaster ? ma->cg : nullptr,
3819 dd->ncg_home*sizeof(int), dd->index_gl);
3821 /* Determine the home charge group sizes */
3822 dd->cgindex[0] = 0;
3823 for (i = 0; i < dd->ncg_home; i++)
3825 cg_gl = dd->index_gl[i];
3826 dd->cgindex[i+1] =
3827 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3830 if (debug)
3832 fprintf(debug, "Home charge groups:\n");
3833 for (i = 0; i < dd->ncg_home; i++)
3835 fprintf(debug, " %d", dd->index_gl[i]);
3836 if (i % 10 == 9)
3838 fprintf(debug, "\n");
3841 fprintf(debug, "\n");
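/* Note on the send-buffer layout (inferred from the two functions below,
 * compact_and_copy_vec_at and compact_and_copy_vec_cg): for every moved
 * charge group, comm->cgcm_state[m] holds 1 rvec for the center (or x with
 * the Verlet scheme) followed by nrcg rvecs for each of the nvec state
 * vectors; 'vec' selects which of those nvec slots a call fills.
 */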
3845 static int compact_and_copy_vec_at(int ncg, int *move,
3846 int *cgindex,
3847 int nvec, int vec,
3848 rvec *src, gmx_domdec_comm_t *comm,
3849 gmx_bool bCompact)
3851 int m, icg, i, i0, i1, nrcg;
3852 int home_pos;
3853 int pos_vec[DIM*2];
3855 home_pos = 0;
3857 for (m = 0; m < DIM*2; m++)
3859 pos_vec[m] = 0;
3862 i0 = 0;
3863 for (icg = 0; icg < ncg; icg++)
3865 i1 = cgindex[icg+1];
3866 m = move[icg];
3867 if (m == -1)
3869 if (bCompact)
3871 /* Compact the home array in place */
3872 for (i = i0; i < i1; i++)
3874 copy_rvec(src[i], src[home_pos++]);
3878 else
3880 /* Copy to the communication buffer */
3881 nrcg = i1 - i0;
3882 pos_vec[m] += 1 + vec*nrcg;
3883 for (i = i0; i < i1; i++)
3885 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3887 pos_vec[m] += (nvec - vec - 1)*nrcg;
3889 if (!bCompact)
3891 home_pos += i1 - i0;
3893 i0 = i1;
3896 return home_pos;
3899 static int compact_and_copy_vec_cg(int ncg, int *move,
3900 int *cgindex,
3901 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3902 gmx_bool bCompact)
3904 int m, icg, i0, i1, nrcg;
3905 int home_pos;
3906 int pos_vec[DIM*2];
3908 home_pos = 0;
3910 for (m = 0; m < DIM*2; m++)
3912 pos_vec[m] = 0;
3915 i0 = 0;
3916 for (icg = 0; icg < ncg; icg++)
3918 i1 = cgindex[icg+1];
3919 m = move[icg];
3920 if (m == -1)
3922 if (bCompact)
3924 /* Compact the home array in place */
3925 copy_rvec(src[icg], src[home_pos++]);
3928 else
3930 nrcg = i1 - i0;
3931 /* Copy to the communication buffer */
3932 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3933 pos_vec[m] += 1 + nrcg*nvec;
3935 i0 = i1;
3937 if (!bCompact)
3939 home_pos = ncg;
3942 return home_pos;
3945 static int compact_ind(int ncg, int *move,
3946 int *index_gl, int *cgindex,
3947 int *gatindex,
3948 gmx_ga2la_t *ga2la, char *bLocalCG,
3949 int *cginfo)
3951 int cg, nat, a0, a1, a, a_gl;
3952 int home_pos;
3954 home_pos = 0;
3955 nat = 0;
3956 for (cg = 0; cg < ncg; cg++)
3958 a0 = cgindex[cg];
3959 a1 = cgindex[cg+1];
3960 if (move[cg] == -1)
3962 /* Compact the home arrays in place.
3963 * Anything that can be done here avoids access to global arrays.
3965 cgindex[home_pos] = nat;
3966 for (a = a0; a < a1; a++)
3968 a_gl = gatindex[a];
3969 gatindex[nat] = a_gl;
3970 /* The cell number stays 0, so we don't need to set it */
3971 ga2la_change_la(ga2la, a_gl, nat);
3972 nat++;
3974 index_gl[home_pos] = index_gl[cg];
3975 cginfo[home_pos] = cginfo[cg];
3976 /* The charge group remains local, so bLocalCG does not change */
3977 home_pos++;
3979 else
3981 /* Clear the global indices */
3982 for (a = a0; a < a1; a++)
3984 ga2la_del(ga2la, gatindex[a]);
3986 if (bLocalCG)
3988 bLocalCG[index_gl[cg]] = FALSE;
3992 cgindex[home_pos] = nat;
3994 return home_pos;
3997 static void clear_and_mark_ind(int ncg, int *move,
3998 int *index_gl, int *cgindex, int *gatindex,
3999 gmx_ga2la_t *ga2la, char *bLocalCG,
4000 int *cell_index)
4002 int cg, a0, a1, a;
4004 for (cg = 0; cg < ncg; cg++)
4006 if (move[cg] >= 0)
4008 a0 = cgindex[cg];
4009 a1 = cgindex[cg+1];
4010 /* Clear the global indices */
4011 for (a = a0; a < a1; a++)
4013 ga2la_del(ga2la, gatindex[a]);
4015 if (bLocalCG)
4017 bLocalCG[index_gl[cg]] = FALSE;
4019 /* Signal that this cg has moved using the ns cell index.
4020 * Here we set it to -1. fill_grid will change it
4021 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4023 cell_index[cg] = -1;
4028 static void print_cg_move(FILE *fplog,
4029 gmx_domdec_t *dd,
4030 gmx_int64_t step, int cg, int dim, int dir,
4031 gmx_bool bHaveCgcmOld, real limitd,
4032 rvec cm_old, rvec cm_new, real pos_d)
4034 gmx_domdec_comm_t *comm;
4035 char buf[22];
4037 comm = dd->comm;
4039 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4040 if (limitd > 0)
4042 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4043 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4044 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4046 else
4048 /* We don't have a limiting distance available: don't print it */
4049 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4050 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4051 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4053 fprintf(fplog, "distance out of cell %f\n",
4054 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4055 if (bHaveCgcmOld)
4057 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4058 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4060 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4061 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4062 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4063 dim2char(dim),
4064 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4065 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4066 dim2char(dim),
4067 comm->cell_x0[dim], comm->cell_x1[dim]);
4070 static void cg_move_error(FILE *fplog,
4071 gmx_domdec_t *dd,
4072 gmx_int64_t step, int cg, int dim, int dir,
4073 gmx_bool bHaveCgcmOld, real limitd,
4074 rvec cm_old, rvec cm_new, real pos_d)
4076 if (fplog)
4078 print_cg_move(fplog, dd, step, cg, dim, dir,
4079 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4081 print_cg_move(stderr, dd, step, cg, dim, dir,
4082 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4083 gmx_fatal(FARGS,
4084 "%s moved too far between two domain decomposition steps\n"
4085 "This usually means that your system is not well equilibrated",
4086 dd->comm->bCGs ? "A charge group" : "An atom");
4089 static void rotate_state_atom(t_state *state, int a)
4091 if (state->flags & (1 << estX))
4093 /* Rotate the complete state; for a rectangular box only */
4094 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4095 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4097 if (state->flags & (1 << estV))
4099 state->v[a][YY] = -state->v[a][YY];
4100 state->v[a][ZZ] = -state->v[a][ZZ];
4102 if (state->flags & (1 << estCGP))
4104 state->cg_p[a][YY] = -state->cg_p[a][YY];
4105 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4109 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4111 if (natoms > comm->moved_nalloc)
4113 /* Contents should be preserved here */
4114 comm->moved_nalloc = over_alloc_dd(natoms);
4115 srenew(comm->moved, comm->moved_nalloc);
4118 return comm->moved;
4121 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4122 gmx_domdec_t *dd,
4123 t_state *state,
4124 ivec tric_dir, matrix tcm,
4125 rvec cell_x0, rvec cell_x1,
4126 rvec limitd, rvec limit0, rvec limit1,
4127 const int *cgindex,
4128 int cg_start, int cg_end,
4129 rvec *cg_cm,
4130 int *move)
4132 int npbcdim;
4133 int cg, k, k0, k1, d, dim, d2;
4134 int mc, nrcg;
4135 int flag;
4136 gmx_bool bScrew;
4137 ivec dev;
4138 real inv_ncg, pos_d;
4139 rvec cm_new;
4141 npbcdim = dd->npbcdim;
4143 for (cg = cg_start; cg < cg_end; cg++)
4145 k0 = cgindex[cg];
4146 k1 = cgindex[cg+1];
4147 nrcg = k1 - k0;
4148 if (nrcg == 1)
4150 copy_rvec(state->x[k0], cm_new);
4152 else
4154 inv_ncg = 1.0/nrcg;
4156 clear_rvec(cm_new);
4157 for (k = k0; (k < k1); k++)
4159 rvec_inc(cm_new, state->x[k]);
4161 for (d = 0; (d < DIM); d++)
4163 cm_new[d] = inv_ncg*cm_new[d];
4167 clear_ivec(dev);
4168 /* Do pbc and check DD cell boundary crossings */
4169 for (d = DIM-1; d >= 0; d--)
4171 if (dd->nc[d] > 1)
4173 bScrew = (dd->bScrewPBC && d == XX);
4174 /* Determine the location of this cg in lattice coordinates */
4175 pos_d = cm_new[d];
4176 if (tric_dir[d])
4178 for (d2 = d+1; d2 < DIM; d2++)
4180 pos_d += cm_new[d2]*tcm[d2][d];
4183 /* Put the charge group in the triclinic unit-cell */
4184 if (pos_d >= cell_x1[d])
4186 if (pos_d >= limit1[d])
4188 cg_move_error(fplog, dd, step, cg, d, 1,
4189 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4190 cg_cm[cg], cm_new, pos_d);
4192 dev[d] = 1;
4193 if (dd->ci[d] == dd->nc[d] - 1)
4195 rvec_dec(cm_new, state->box[d]);
4196 if (bScrew)
4198 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4199 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4201 for (k = k0; (k < k1); k++)
4203 rvec_dec(state->x[k], state->box[d]);
4204 if (bScrew)
4206 rotate_state_atom(state, k);
4211 else if (pos_d < cell_x0[d])
4213 if (pos_d < limit0[d])
4215 cg_move_error(fplog, dd, step, cg, d, -1,
4216 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4217 cg_cm[cg], cm_new, pos_d);
4219 dev[d] = -1;
4220 if (dd->ci[d] == 0)
4222 rvec_inc(cm_new, state->box[d]);
4223 if (bScrew)
4225 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4226 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4228 for (k = k0; (k < k1); k++)
4230 rvec_inc(state->x[k], state->box[d]);
4231 if (bScrew)
4233 rotate_state_atom(state, k);
4239 else if (d < npbcdim)
4241 /* Put the charge group in the rectangular unit-cell */
4242 while (cm_new[d] >= state->box[d][d])
4244 rvec_dec(cm_new, state->box[d]);
4245 for (k = k0; (k < k1); k++)
4247 rvec_dec(state->x[k], state->box[d]);
4250 while (cm_new[d] < 0)
4252 rvec_inc(cm_new, state->box[d]);
4253 for (k = k0; (k < k1); k++)
4255 rvec_inc(state->x[k], state->box[d]);
4261 copy_rvec(cm_new, cg_cm[cg]);
4263 /* Determine where this cg should go */
4264 flag = 0;
4265 mc = -1;
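/* Communication direction index: d*2 is forward, d*2+1 is backward along
 * decomposition dimension d; with only two cells along a dimension both
 * neighbors are the same rank, so the forward buffer is reused.
 */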
4266 for (d = 0; d < dd->ndim; d++)
4268 dim = dd->dim[d];
4269 if (dev[dim] == 1)
4271 flag |= DD_FLAG_FW(d);
4272 if (mc == -1)
4274 mc = d*2;
4277 else if (dev[dim] == -1)
4279 flag |= DD_FLAG_BW(d);
4280 if (mc == -1)
4282 if (dd->nc[dim] > 2)
4284 mc = d*2 + 1;
4286 else
4288 mc = d*2;
4293 /* Temporarily store the flag in move */
4294 move[cg] = mc + flag;
4298 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4299 gmx_domdec_t *dd, ivec tric_dir,
4300 t_state *state, PaddedRVecVector *f,
4301 t_forcerec *fr,
4302 gmx_bool bCompact,
4303 t_nrnb *nrnb,
4304 int *ncg_stay_home,
4305 int *ncg_moved)
4307 int *move;
4308 int npbcdim;
4309 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4310 int i, cg, k, d, dim, dim2, dir, d2, d3;
4311 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4312 int sbuf[2], rbuf[2];
4313 int home_pos_cg, home_pos_at, buf_pos;
4314 int flag;
4315 real pos_d;
4316 matrix tcm;
4317 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4318 int *cgindex;
4319 cginfo_mb_t *cginfo_mb;
4320 gmx_domdec_comm_t *comm;
4321 int *moved;
4322 int nthread, thread;
4324 if (dd->bScrewPBC)
4326 check_screw_box(state->box);
4329 comm = dd->comm;
4330 if (fr->cutoff_scheme == ecutsGROUP)
4332 cg_cm = fr->cg_cm;
4335 // Positions are always present, so there's nothing to flag
4336 bool bV = state->flags & (1<<estV);
4337 bool bCGP = state->flags & (1<<estCGP);
4339 if (dd->ncg_tot > comm->nalloc_int)
4341 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4342 srenew(comm->buf_int, comm->nalloc_int);
4344 move = comm->buf_int;
4346 npbcdim = dd->npbcdim;
4348 for (d = 0; (d < DIM); d++)
4350 limitd[d] = dd->comm->cellsize_min[d];
4351 if (d >= npbcdim && dd->ci[d] == 0)
4353 cell_x0[d] = -GMX_FLOAT_MAX;
4355 else
4357 cell_x0[d] = comm->cell_x0[d];
4359 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4361 cell_x1[d] = GMX_FLOAT_MAX;
4363 else
4365 cell_x1[d] = comm->cell_x1[d];
4367 if (d < npbcdim)
4369 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4370 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4372 else
4374 /* We check after communication if a charge group moved
4375 * more than one cell. Set the pre-comm check limit to float_max.
4377 limit0[d] = -GMX_FLOAT_MAX;
4378 limit1[d] = GMX_FLOAT_MAX;
4382 make_tric_corr_matrix(npbcdim, state->box, tcm);
4384 cgindex = dd->cgindex;
4386 nthread = gmx_omp_nthreads_get(emntDomdec);
4388 /* Compute the center of geometry for all home charge groups
4389 * and put them in the box and determine where they should go.
4391 #pragma omp parallel for num_threads(nthread) schedule(static)
4392 for (thread = 0; thread < nthread; thread++)
4396 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4397 cell_x0, cell_x1, limitd, limit0, limit1,
4398 cgindex,
4399 ( thread *dd->ncg_home)/nthread,
4400 ((thread+1)*dd->ncg_home)/nthread,
4401 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4402 move);
4404 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4407 for (cg = 0; cg < dd->ncg_home; cg++)
4409 if (move[cg] >= 0)
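/* move[cg] was set in calc_cg_move to the target direction index plus the
 * DD_FLAG_FW/BW bits; split it into the direction (mc) and the flag bits.
 */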
4411 mc = move[cg];
4412 flag = mc & ~DD_FLAG_NRCG;
4413 mc = mc & DD_FLAG_NRCG;
4414 move[cg] = mc;
4416 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4418 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4419 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4421 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4422 /* We store the cg size in the lower 16 bits
4423 * and the place where the charge group should go
4424 * in the next 6 bits. This saves some communication volume.
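 * Example (illustrative): a charge group of 3 atoms that only crosses the
 * forward boundary of decomposition dimension d is sent as 3 | DD_FLAG_FW(d).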
4426 nrcg = cgindex[cg+1] - cgindex[cg];
4427 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4428 ncg[mc] += 1;
4429 nat[mc] += nrcg;
4433 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4434 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4436 *ncg_moved = 0;
4437 for (i = 0; i < dd->ndim*2; i++)
4439 *ncg_moved += ncg[i];
4442 nvec = 1;
4443 if (bV)
4445 nvec++;
4447 if (bCGP)
4449 nvec++;
4452 /* Make sure the communication buffers are large enough */
4453 for (mc = 0; mc < dd->ndim*2; mc++)
4455 nvr = ncg[mc] + nat[mc]*nvec;
4456 if (nvr > comm->cgcm_state_nalloc[mc])
4458 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4459 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4463 switch (fr->cutoff_scheme)
4465 case ecutsGROUP:
4466 /* Recalculating cg_cm might be cheaper than communicating,
4467 * but that could give rise to rounding issues.
4469 home_pos_cg =
4470 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4471 nvec, cg_cm, comm, bCompact);
4472 break;
4473 case ecutsVERLET:
4474 /* Without charge groups we send the moved atom coordinates
4475 * over twice. This is so the code below can be used without
4476 * many conditionals both with and without charge groups.
4478 home_pos_cg =
4479 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4480 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4481 if (bCompact)
4483 home_pos_cg -= *ncg_moved;
4485 break;
4486 default:
4487 gmx_incons("unimplemented");
4488 home_pos_cg = 0;
4491 vec = 0;
4492 home_pos_at =
4493 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4494 nvec, vec++, as_rvec_array(state->x.data()),
4495 comm, bCompact);
4496 if (bV)
4498 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4499 nvec, vec++, as_rvec_array(state->v.data()),
4500 comm, bCompact);
4502 if (bCGP)
4504 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4505 nvec, vec++, as_rvec_array(state->cg_p.data()),
4506 comm, bCompact);
4509 if (bCompact)
4511 compact_ind(dd->ncg_home, move,
4512 dd->index_gl, dd->cgindex, dd->gatindex,
4513 dd->ga2la, comm->bLocalCG,
4514 fr->cginfo);
4516 else
4518 if (fr->cutoff_scheme == ecutsVERLET)
4520 moved = get_moved(comm, dd->ncg_home);
4522 for (k = 0; k < dd->ncg_home; k++)
4524 moved[k] = 0;
4527 else
4529 moved = fr->ns->grid->cell_index;
4532 clear_and_mark_ind(dd->ncg_home, move,
4533 dd->index_gl, dd->cgindex, dd->gatindex,
4534 dd->ga2la, comm->bLocalCG,
4535 moved);
4538 cginfo_mb = fr->cginfo_mb;
4540 *ncg_stay_home = home_pos_cg;
4541 for (d = 0; d < dd->ndim; d++)
4543 dim = dd->dim[d];
4544 ncg_recv = 0;
4545 nvr = 0;
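/* With only two cells along this dimension the forward and backward
 * neighbors are the same rank, so a single exchange suffices.
 */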
4546 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4548 cdd = d*2 + dir;
4549 /* Communicate the cg and atom counts */
4550 sbuf[0] = ncg[cdd];
4551 sbuf[1] = nat[cdd];
4552 if (debug)
4554 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4555 d, dir, sbuf[0], sbuf[1]);
4557 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4559 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4561 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4562 srenew(comm->buf_int, comm->nalloc_int);
4565 /* Communicate the charge group indices, sizes and flags */
4566 dd_sendrecv_int(dd, d, dir,
4567 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4568 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4570 nvs = ncg[cdd] + nat[cdd]*nvec;
4571 i = rbuf[0] + rbuf[1] *nvec;
4572 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4574 /* Communicate cgcm and state */
4575 dd_sendrecv_rvec(dd, d, dir,
4576 comm->cgcm_state[cdd], nvs,
4577 comm->vbuf.v+nvr, i);
4578 ncg_recv += rbuf[0];
4579 nvr += i;
4582 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4583 if (fr->cutoff_scheme == ecutsGROUP)
4585 /* Here we resize to more than necessary and shrink later */
4586 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4589 /* Process the received charge groups */
4590 buf_pos = 0;
4591 for (cg = 0; cg < ncg_recv; cg++)
4593 flag = comm->buf_int[cg*DD_CGIBS+1];
4595 if (dim >= npbcdim && dd->nc[dim] > 2)
4597 /* No pbc in this dim and more than one domain boundary.
4598 * We do a separate check that a charge group didn't move too far.
4600 if (((flag & DD_FLAG_FW(d)) &&
4601 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4602 ((flag & DD_FLAG_BW(d)) &&
4603 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4605 cg_move_error(fplog, dd, step, cg, dim,
4606 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4607 fr->cutoff_scheme == ecutsGROUP, 0,
4608 comm->vbuf.v[buf_pos],
4609 comm->vbuf.v[buf_pos],
4610 comm->vbuf.v[buf_pos][dim]);
4614 mc = -1;
4615 if (d < dd->ndim-1)
4617 /* Check which direction this cg should go */
4618 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4620 if (isDlbOn(dd->comm))
4622 /* The cell boundaries for dimension d2 are not equal
4623 * for each cell row of the lower dimension(s),
4624 * therefore we might need to redetermine where
4625 * this cg should go.
4627 dim2 = dd->dim[d2];
4628 /* If this cg crosses the box boundary in dimension d2
4629 * we can use the communicated flag, so we do not
4630 * have to worry about pbc.
4632 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4633 (flag & DD_FLAG_FW(d2))) ||
4634 (dd->ci[dim2] == 0 &&
4635 (flag & DD_FLAG_BW(d2)))))
4637 /* Clear the two flags for this dimension */
4638 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4639 /* Determine the location of this cg
4640 * in lattice coordinates
4642 pos_d = comm->vbuf.v[buf_pos][dim2];
4643 if (tric_dir[dim2])
4645 for (d3 = dim2+1; d3 < DIM; d3++)
4647 pos_d +=
4648 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4651 /* Check if we are not at the box edge.
4652 * pbc is only handled in the first step above,
4653 * but this check could move the cg over pbc while
4654 * the first step did not, due to different rounding.
4656 if (pos_d >= cell_x1[dim2] &&
4657 dd->ci[dim2] != dd->nc[dim2]-1)
4659 flag |= DD_FLAG_FW(d2);
4661 else if (pos_d < cell_x0[dim2] &&
4662 dd->ci[dim2] != 0)
4664 flag |= DD_FLAG_BW(d2);
4666 comm->buf_int[cg*DD_CGIBS+1] = flag;
4669 /* Set to which neighboring cell this cg should go */
4670 if (flag & DD_FLAG_FW(d2))
4672 mc = d2*2;
4674 else if (flag & DD_FLAG_BW(d2))
4676 if (dd->nc[dd->dim[d2]] > 2)
4678 mc = d2*2+1;
4680 else
4682 mc = d2*2;
4688 nrcg = flag & DD_FLAG_NRCG;
4689 if (mc == -1)
4691 if (home_pos_cg+1 > dd->cg_nalloc)
4693 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4694 srenew(dd->index_gl, dd->cg_nalloc);
4695 srenew(dd->cgindex, dd->cg_nalloc+1);
4697 /* Set the global charge group index and size */
4698 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4699 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4700 /* Copy the state from the buffer */
4701 if (fr->cutoff_scheme == ecutsGROUP)
4703 cg_cm = fr->cg_cm;
4704 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4706 buf_pos++;
4708 /* Set the cginfo */
4709 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4710 dd->index_gl[home_pos_cg]);
4711 if (comm->bLocalCG)
4713 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4716 for (i = 0; i < nrcg; i++)
4718 copy_rvec(comm->vbuf.v[buf_pos++],
4719 state->x[home_pos_at+i]);
4721 if (bV)
4723 for (i = 0; i < nrcg; i++)
4725 copy_rvec(comm->vbuf.v[buf_pos++],
4726 state->v[home_pos_at+i]);
4729 if (bCGP)
4731 for (i = 0; i < nrcg; i++)
4733 copy_rvec(comm->vbuf.v[buf_pos++],
4734 state->cg_p[home_pos_at+i]);
4737 home_pos_cg += 1;
4738 home_pos_at += nrcg;
4740 else
4742 /* Reallocate the buffers if necessary */
4743 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4745 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4746 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4748 nvr = ncg[mc] + nat[mc]*nvec;
4749 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4751 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4752 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4754 /* Copy from the receive to the send buffers */
4755 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4756 comm->buf_int + cg*DD_CGIBS,
4757 DD_CGIBS*sizeof(int));
4758 memcpy(comm->cgcm_state[mc][nvr],
4759 comm->vbuf.v[buf_pos],
4760 (1+nrcg*nvec)*sizeof(rvec));
4761 buf_pos += 1 + nrcg*nvec;
4762 ncg[mc] += 1;
4763 nat[mc] += nrcg;
4768 /* With sorting (!bCompact) the indices are now only partially up to date
4769 * and ncg_home and nat_home are not the real count, since there are
4770 * "holes" in the arrays for the charge groups that moved to neighbors.
4772 if (fr->cutoff_scheme == ecutsVERLET)
4774 moved = get_moved(comm, home_pos_cg);
4776 for (i = dd->ncg_home; i < home_pos_cg; i++)
4778 moved[i] = 0;
4781 dd->ncg_home = home_pos_cg;
4782 dd->nat_home = home_pos_at;
4784 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4786 /* We overallocated before, we need to set the right size here */
4787 dd_resize_state(state, f, dd->nat_home);
4790 if (debug)
4792 fprintf(debug,
4793 "Finished repartitioning: cgs moved out %d, new home %d\n",
4794 *ncg_moved, dd->ncg_home-*ncg_moved);
4799 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
4801 /* Note that the cycles value can be incorrect, either 0 or some
4802 * extremely large value, when our thread migrated to another core
4803 * with an unsynchronized cycle counter. If this happens less often
4804      * than once per nstlist steps, this will not cause issues, since
4805 * we later subtract the maximum value from the sum over nstlist steps.
4806 * A zero count will slightly lower the total, but that's a small effect.
4807 * Note that the main purpose of the subtraction of the maximum value
4808 * is to avoid throwing off the load balancing when stalls occur due
4809      * to, e.g., system activity or network congestion.
4811 dd->comm->cycl[ddCycl] += cycles;
4812 dd->comm->cycl_n[ddCycl]++;
4813 if (cycles > dd->comm->cycl_max[ddCycl])
4815 dd->comm->cycl_max[ddCycl] = cycles;
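             /* Illustrative sketch (not part of the original code): given the
              * accumulators updated above, a stall-robust load estimate for an
              * nstlist interval can be formed roughly as
              *     load ~ cycl[ddCycl] - cycl_max[ddCycl]
              * i.e. the sum with the single largest sample removed; the exact
              * expression used by dd_force_load() may differ.
              */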
4819 static double force_flop_count(t_nrnb *nrnb)
4821 int i;
4822 double sum;
4823 const char *name;
4825 sum = 0;
4826 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4828         /* To get closer to the real timings, we halve the count
4829          * for the normal loops and halve it again for water loops.
4831 name = nrnb_str(i);
4832 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4834 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4836 else
4838 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4841 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4843 name = nrnb_str(i);
4844 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4846 sum += nrnb->n[i]*cost_nrnb(i);
4849 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4851 sum += nrnb->n[i]*cost_nrnb(i);
4854 return sum;
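     /* Illustrative example (not from the original sources): with the
      * weighting above, a hypothetical count of 1e9 plain non-bonded kernel
      * interactions, 1e9 water-loop (W3/W4) interactions and 1e8 bonded
      * interactions contributes roughly
      *     sum = 1e9*0.50*c_nb + 1e9*0.25*c_w + 1e8*c_bonded
      * where the c_* factors stand for the per-interaction costs returned
      * by cost_nrnb().
      */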
4857 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4859 if (dd->comm->eFlop)
4861 dd->comm->flop -= force_flop_count(nrnb);
4864 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4866 if (dd->comm->eFlop)
4868 dd->comm->flop += force_flop_count(nrnb);
4869 dd->comm->flop_n++;
4873 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4875 int i;
4877 for (i = 0; i < ddCyclNr; i++)
4879 dd->comm->cycl[i] = 0;
4880 dd->comm->cycl_n[i] = 0;
4881 dd->comm->cycl_max[i] = 0;
4883 dd->comm->flop = 0;
4884 dd->comm->flop_n = 0;
4887 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4889 gmx_domdec_comm_t *comm;
4890 domdec_load_t *load;
4891 domdec_root_t *root = nullptr;
4892 int d, dim, i, pos;
4893 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4894 gmx_bool bSepPME;
4896 if (debug)
4898 fprintf(debug, "get_load_distribution start\n");
4901 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4903 comm = dd->comm;
4905 bSepPME = (dd->pme_nodeid >= 0);
4907 if (dd->ndim == 0 && bSepPME)
4909 /* Without decomposition, but with PME nodes, we need the load */
4910 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4911 comm->load[0].pme = comm->cycl[ddCyclPME];
4914 for (d = dd->ndim-1; d >= 0; d--)
4916 dim = dd->dim[d];
4917 /* Check if we participate in the communication in this dimension */
4918 if (d == dd->ndim-1 ||
4919 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4921 load = &comm->load[d];
4922 if (isDlbOn(dd->comm))
4924 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4926 pos = 0;
4927 if (d == dd->ndim-1)
4929 sbuf[pos++] = dd_force_load(comm);
4930 sbuf[pos++] = sbuf[0];
4931 if (isDlbOn(dd->comm))
4933 sbuf[pos++] = sbuf[0];
4934 sbuf[pos++] = cell_frac;
4935 if (d > 0)
4937 sbuf[pos++] = comm->cell_f_max0[d];
4938 sbuf[pos++] = comm->cell_f_min1[d];
4941 if (bSepPME)
4943 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4944 sbuf[pos++] = comm->cycl[ddCyclPME];
4947 else
4949 sbuf[pos++] = comm->load[d+1].sum;
4950 sbuf[pos++] = comm->load[d+1].max;
4951 if (isDlbOn(dd->comm))
4953 sbuf[pos++] = comm->load[d+1].sum_m;
4954 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4955 sbuf[pos++] = comm->load[d+1].flags;
4956 if (d > 0)
4958 sbuf[pos++] = comm->cell_f_max0[d];
4959 sbuf[pos++] = comm->cell_f_min1[d];
4962 if (bSepPME)
4964 sbuf[pos++] = comm->load[d+1].mdf;
4965 sbuf[pos++] = comm->load[d+1].pme;
4968 load->nload = pos;
4969 /* Communicate a row in DD direction d.
4970              * The communicators are set up such that the root always has rank 0.
4972 #if GMX_MPI
4973 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
4974 load->load, load->nload*sizeof(float), MPI_BYTE,
4975 0, comm->mpi_comm_load[d]);
4976 #endif
4977 if (dd->ci[dim] == dd->master_ci[dim])
4979 /* We are the root, process this row */
4980 if (isDlbOn(comm))
4982 root = comm->root[d];
4984 load->sum = 0;
4985 load->max = 0;
4986 load->sum_m = 0;
4987 load->cvol_min = 1;
4988 load->flags = 0;
4989 load->mdf = 0;
4990 load->pme = 0;
4991 pos = 0;
4992 for (i = 0; i < dd->nc[dim]; i++)
4994 load->sum += load->load[pos++];
4995 load->max = std::max(load->max, load->load[pos]);
4996 pos++;
4997 if (isDlbOn(dd->comm))
4999 if (root->bLimited)
5001 /* This direction could not be load balanced properly,
5002                              * therefore we need to use the maximum instead of the average load.
5004 load->sum_m = std::max(load->sum_m, load->load[pos]);
5006 else
5008 load->sum_m += load->load[pos];
5010 pos++;
5011 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5012 pos++;
5013 if (d < dd->ndim-1)
5015 load->flags = (int)(load->load[pos++] + 0.5);
5017 if (d > 0)
5019 root->cell_f_max0[i] = load->load[pos++];
5020 root->cell_f_min1[i] = load->load[pos++];
5023 if (bSepPME)
5025 load->mdf = std::max(load->mdf, load->load[pos]);
5026 pos++;
5027 load->pme = std::max(load->pme, load->load[pos]);
5028 pos++;
5031 if (isDlbOn(comm) && root->bLimited)
5033 load->sum_m *= dd->nc[dim];
5034 load->flags |= (1<<d);
5040 if (DDMASTER(dd))
5042 comm->nload += dd_load_count(comm);
5043 comm->load_step += comm->cycl[ddCyclStep];
5044 comm->load_sum += comm->load[0].sum;
5045 comm->load_max += comm->load[0].max;
5046 if (isDlbOn(comm))
5048 for (d = 0; d < dd->ndim; d++)
5050 if (comm->load[0].flags & (1<<d))
5052 comm->load_lim[d]++;
5056 if (bSepPME)
5058 comm->load_mdf += comm->load[0].mdf;
5059 comm->load_pme += comm->load[0].pme;
5063 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5065 if (debug)
5067 fprintf(debug, "get_load_distribution finished\n");
5071 static float dd_force_load_fraction(gmx_domdec_t *dd)
5073     /* Return the average fraction of the MD step time that is spent
5074      * in the balanceable (force calculation) part of the step.
5076 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5078 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
5080 else
5082 return 0;
5086 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5088 /* Return the relative performance loss on the total run time
5089 * due to the force calculation load imbalance.
5091 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5093 return
5094 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5095 (dd->comm->load_step*dd->nnodes);
5097 else
5099 return 0;
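 /* Summary of the two ratios above (illustrative numbers, not from the
  * original sources):
  *     load fraction  f = load_sum/(load_step*nnodes)
  *     imbalance loss l = (load_max*nnodes - load_sum)/(load_step*nnodes)
  * e.g. with nnodes = 4, load_sum = 2.0, load_max = 0.7 and load_step = 1.0
  * this gives f = 0.5 and l = (2.8 - 2.0)/4 = 0.2, i.e. 20% of the run time
  * is lost waiting for the most loaded rank.
  */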
5103 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5105 gmx_domdec_comm_t *comm = dd->comm;
5107 /* Only the master rank prints loads and only if we measured loads */
5108 if (!DDMASTER(dd) || comm->nload == 0)
5110 return;
5113 char buf[STRLEN];
5114 int numPpRanks = dd->nnodes;
5115 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5116 int numRanks = numPpRanks + numPmeRanks;
5117 float lossFraction = 0;
5119 /* Print the average load imbalance and performance loss */
5120 if (dd->nnodes > 1 && comm->load_sum > 0)
5122 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
5123 lossFraction = dd_force_imb_perf_loss(dd);
5125 std::string msg = "\n Dynamic load balancing report:\n";
5126 std::string dlbStateStr = "";
5128 switch (dd->comm->dlbState)
5130 case edlbsOffUser:
5131 dlbStateStr = "DLB was off during the run per user request.";
5132 break;
5133 case edlbsOffForever:
5134             /* Currently this can happen due to observed performance loss, cell size
5135              * limitations or incompatibility with other settings detected during
5136              * determineInitialDlbState(). */
5137 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
5138 break;
5139 case edlbsOffCanTurnOn:
5140 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
5141 break;
5142 case edlbsOffTemporarilyLocked:
5143 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
5144 break;
5145 case edlbsOnCanTurnOff:
5146 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
5147 break;
5148 case edlbsOnUser:
5149 dlbStateStr = "DLB was permanently on during the run per user request.";
5150 break;
5151 default:
5152 GMX_ASSERT(false, "Undocumented DLB state");
5155 msg += " " + dlbStateStr + "\n";
5156 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
5157 msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
5158 static_cast<int>(dd_force_load_fraction(dd)*100 + 0.5));
5159 msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
5160 lossFraction*100);
5161 fprintf(fplog, "%s", msg.c_str());
5162 fprintf(stderr, "%s", msg.c_str());
5165 /* Print during what percentage of steps the load balancing was limited */
5166 bool dlbWasLimited = false;
5167 if (isDlbOn(comm))
5169 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5170 for (int d = 0; d < dd->ndim; d++)
5172 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
5173 sprintf(buf+strlen(buf), " %c %d %%",
5174 dim2char(dd->dim[d]), limitPercentage);
5175 if (limitPercentage >= 50)
5177 dlbWasLimited = true;
5180 sprintf(buf + strlen(buf), "\n");
5181 fprintf(fplog, "%s", buf);
5182 fprintf(stderr, "%s", buf);
5185 /* Print the performance loss due to separate PME - PP rank imbalance */
5186 float lossFractionPme = 0;
5187 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5189 float pmeForceRatio = comm->load_pme/comm->load_mdf;
5190 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
5191 if (lossFractionPme <= 0)
5193 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
5195 else
5197 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
5199 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
5200 fprintf(fplog, "%s", buf);
5201 fprintf(stderr, "%s", buf);
5202 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
5203 fprintf(fplog, "%s", buf);
5204 fprintf(stderr, "%s", buf);
5206 fprintf(fplog, "\n");
5207 fprintf(stderr, "\n");
5209 if (lossFraction >= DD_PERF_LOSS_WARN)
5211 sprintf(buf,
5212 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5213 " in the domain decomposition.\n", lossFraction*100);
5214 if (!isDlbOn(comm))
5216 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
5218 else if (dlbWasLimited)
5220 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5222 fprintf(fplog, "%s\n", buf);
5223 fprintf(stderr, "%s\n", buf);
5225 if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
5227 sprintf(buf,
5228 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5229 " had %s work to do than the PP ranks.\n"
5230 " You might want to %s the number of PME ranks\n"
5231 " or %s the cut-off and the grid spacing.\n",
5232 fabs(lossFractionPme*100),
5233 (lossFractionPme < 0) ? "less" : "more",
5234 (lossFractionPme < 0) ? "decrease" : "increase",
5235 (lossFractionPme < 0) ? "decrease" : "increase");
5236 fprintf(fplog, "%s\n", buf);
5237 fprintf(stderr, "%s\n", buf);
5241 static float dd_vol_min(gmx_domdec_t *dd)
5243 return dd->comm->load[0].cvol_min*dd->nnodes;
5246 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5248 return dd->comm->load[0].flags;
5251 static float dd_f_imbal(gmx_domdec_t *dd)
5253 if (dd->comm->load[0].sum > 0)
5255 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5257 else
5259 /* Something is wrong in the cycle counting, report no load imbalance */
5260 return 0.0f;
5264 float dd_pme_f_ratio(gmx_domdec_t *dd)
5266 /* Should only be called on the DD master rank */
5267 assert(DDMASTER(dd));
5269 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5271 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5273 else
5275 return -1.0;
5279 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5281 int flags, d;
5282 char buf[22];
5284 flags = dd_load_flags(dd);
5285 if (flags)
5287 fprintf(fplog,
5288 "DD load balancing is limited by minimum cell size in dimension");
5289 for (d = 0; d < dd->ndim; d++)
5291 if (flags & (1<<d))
5293 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5296 fprintf(fplog, "\n");
5298 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5299 if (isDlbOn(dd->comm))
5301 fprintf(fplog, " vol min/aver %5.3f%c",
5302 dd_vol_min(dd), flags ? '!' : ' ');
5304 if (dd->nnodes > 1)
5306 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5308 if (dd->comm->cycl_n[ddCyclPME])
5310 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5312 fprintf(fplog, "\n\n");
5315 static void dd_print_load_verbose(gmx_domdec_t *dd)
5317 if (isDlbOn(dd->comm))
5319 fprintf(stderr, "vol %4.2f%c ",
5320 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5322 if (dd->nnodes > 1)
5324 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5326 if (dd->comm->cycl_n[ddCyclPME])
5328 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5332 #if GMX_MPI
5333 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5335 MPI_Comm c_row;
5336 int dim, i, rank;
5337 ivec loc_c;
5338 domdec_root_t *root;
5339 gmx_bool bPartOfGroup = FALSE;
5341 dim = dd->dim[dim_ind];
5342 copy_ivec(loc, loc_c);
5343 for (i = 0; i < dd->nc[dim]; i++)
5345 loc_c[dim] = i;
5346 rank = dd_index(dd->nc, loc_c);
5347 if (rank == dd->rank)
5349 /* This process is part of the group */
5350 bPartOfGroup = TRUE;
5353 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5354 &c_row);
5355 if (bPartOfGroup)
5357 dd->comm->mpi_comm_load[dim_ind] = c_row;
5358 if (!isDlbDisabled(dd->comm))
5360 if (dd->ci[dim] == dd->master_ci[dim])
5362 /* This is the root process of this row */
5363 snew(dd->comm->root[dim_ind], 1);
5364 root = dd->comm->root[dim_ind];
5365 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5366 snew(root->old_cell_f, dd->nc[dim]+1);
5367 snew(root->bCellMin, dd->nc[dim]);
5368 if (dim_ind > 0)
5370 snew(root->cell_f_max0, dd->nc[dim]);
5371 snew(root->cell_f_min1, dd->nc[dim]);
5372 snew(root->bound_min, dd->nc[dim]);
5373 snew(root->bound_max, dd->nc[dim]);
5375 snew(root->buf_ncd, dd->nc[dim]);
5377 else
5379 /* This is not a root process, we only need to receive cell_f */
5380 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5383 if (dd->ci[dim] == dd->master_ci[dim])
5385 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5389 #endif
5391 void dd_setup_dlb_resource_sharing(t_commrec *cr,
5392 int gpu_id)
5394 #if GMX_MPI
5395 int physicalnode_id_hash;
5396 gmx_domdec_t *dd;
5397 MPI_Comm mpi_comm_pp_physicalnode;
5399 if (!(cr->duty & DUTY_PP) || gpu_id < 0)
5401 /* Only ranks with short-ranged tasks (currently) use GPUs.
5402 * If we don't have GPUs assigned, there are no resources to share.
5404 return;
5407 physicalnode_id_hash = gmx_physicalnode_id_hash();
5409 dd = cr->dd;
5411 if (debug)
5413         fprintf(debug, "dd_setup_dlb_resource_sharing:\n");
5414 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5415 dd->rank, physicalnode_id_hash, gpu_id);
5417 /* Split the PP communicator over the physical nodes */
5418 /* TODO: See if we should store this (before), as it's also used for
5419      *       the nodecomm summation.
5421 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5422 &mpi_comm_pp_physicalnode);
5423 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5424 &dd->comm->mpi_comm_gpu_shared);
5425 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5426 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5428 if (debug)
5430 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5433 /* Note that some ranks could share a GPU, while others don't */
5435 if (dd->comm->nrank_gpu_shared == 1)
5437 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5439 #else
5440 GMX_UNUSED_VALUE(cr);
5441 GMX_UNUSED_VALUE(gpu_id);
5442 #endif
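 /* Illustrative example (not part of the original code): with the two
  * MPI_Comm_split calls above, four PP ranks on one physical node with
  * gpu_id = {0, 0, 1, 1} first share one per-node communicator and are then
  * split into two mpi_comm_gpu_shared communicators of size 2, one per GPU.
  * A rank that does not share its GPU frees the communicator again, as done
  * above when nrank_gpu_shared == 1.
  */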
5445 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5447 #if GMX_MPI
5448 int dim0, dim1, i, j;
5449 ivec loc;
5451 if (debug)
5453 fprintf(debug, "Making load communicators\n");
5456 snew(dd->comm->load, std::max(dd->ndim, 1));
5457 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5459 if (dd->ndim == 0)
5461 return;
5464 clear_ivec(loc);
5465 make_load_communicator(dd, 0, loc);
5466 if (dd->ndim > 1)
5468 dim0 = dd->dim[0];
5469 for (i = 0; i < dd->nc[dim0]; i++)
5471 loc[dim0] = i;
5472 make_load_communicator(dd, 1, loc);
5475 if (dd->ndim > 2)
5477 dim0 = dd->dim[0];
5478 for (i = 0; i < dd->nc[dim0]; i++)
5480 loc[dim0] = i;
5481 dim1 = dd->dim[1];
5482 for (j = 0; j < dd->nc[dim1]; j++)
5484 loc[dim1] = j;
5485 make_load_communicator(dd, 2, loc);
5490 if (debug)
5492 fprintf(debug, "Finished making load communicators\n");
5494 #endif
5497 /*! \brief Sets up the relation between neighboring domains and zones */
5498 static void setup_neighbor_relations(gmx_domdec_t *dd)
5500 int d, dim, i, j, m;
5501 ivec tmp, s;
5502 gmx_domdec_zones_t *zones;
5503 gmx_domdec_ns_ranges_t *izone;
5505 for (d = 0; d < dd->ndim; d++)
5507 dim = dd->dim[d];
5508 copy_ivec(dd->ci, tmp);
5509 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5510 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5511 copy_ivec(dd->ci, tmp);
5512 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5513 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5514 if (debug)
5516 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5517 dd->rank, dim,
5518 dd->neighbor[d][0],
5519 dd->neighbor[d][1]);
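         /* Worked example (illustrative only): with dd->nc[dim] = 4 and
          * dd->ci[dim] = 3, the modulo arithmetic above wraps the forward
          * neighbor around to cell 0 and puts the backward neighbor at cell 2.
          */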
5523 int nzone = (1 << dd->ndim);
5524 int nizone = (1 << std::max(dd->ndim - 1, 0));
5525 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5527 zones = &dd->comm->zones;
5529 for (i = 0; i < nzone; i++)
5531 m = 0;
5532 clear_ivec(zones->shift[i]);
5533 for (d = 0; d < dd->ndim; d++)
5535 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5539 zones->n = nzone;
5540 for (i = 0; i < nzone; i++)
5542 for (d = 0; d < DIM; d++)
5544 s[d] = dd->ci[d] - zones->shift[i][d];
5545 if (s[d] < 0)
5547 s[d] += dd->nc[d];
5549 else if (s[d] >= dd->nc[d])
5551 s[d] -= dd->nc[d];
5555 zones->nizone = nizone;
5556 for (i = 0; i < zones->nizone; i++)
5558 assert(ddNonbondedZonePairRanges[i][0] == i);
5560 izone = &zones->izone[i];
5561         /* ddNonbondedZonePairRanges is defined for 3D decomposition; for fewer
5562          * dimensions use only j-zones up to nzone.
5564 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5565 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5566 for (dim = 0; dim < DIM; dim++)
5568 if (dd->nc[dim] == 1)
5570 /* All shifts should be allowed */
5571 izone->shift0[dim] = -1;
5572 izone->shift1[dim] = 1;
5574 else
5576 /* Determine the min/max j-zone shift wrt the i-zone */
5577 izone->shift0[dim] = 1;
5578 izone->shift1[dim] = -1;
5579 for (j = izone->j0; j < izone->j1; j++)
5581 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5582 if (shift_diff < izone->shift0[dim])
5584 izone->shift0[dim] = shift_diff;
5586 if (shift_diff > izone->shift1[dim])
5588 izone->shift1[dim] = shift_diff;
5595 if (!isDlbDisabled(dd->comm))
5597 snew(dd->comm->root, dd->ndim);
5600 if (dd->comm->bRecordLoad)
5602 make_load_communicators(dd);
5606 static void make_pp_communicator(FILE *fplog,
5607 gmx_domdec_t *dd,
5608 t_commrec gmx_unused *cr,
5609 int gmx_unused reorder)
5611 #if GMX_MPI
5612 gmx_domdec_comm_t *comm;
5613 int rank, *buf;
5614 ivec periods;
5615 MPI_Comm comm_cart;
5617 comm = dd->comm;
5619 if (comm->bCartesianPP)
5621 /* Set up cartesian communication for the particle-particle part */
5622 if (fplog)
5624 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5625 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5628 for (int i = 0; i < DIM; i++)
5630 periods[i] = TRUE;
5632 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5633 &comm_cart);
5634 /* We overwrite the old communicator with the new cartesian one */
5635 cr->mpi_comm_mygroup = comm_cart;
5638 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5639 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5641 if (comm->bCartesianPP_PME)
5643         /* Since we want to use the original Cartesian setup for the simulation,
5644          * and not the one after splitting, we need to make an index.
5646 snew(comm->ddindex2ddnodeid, dd->nnodes);
5647 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5648 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5649 /* Get the rank of the DD master,
5650 * above we made sure that the master node is a PP node.
5652 if (MASTER(cr))
5654 rank = dd->rank;
5656 else
5658 rank = 0;
5660 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5662 else if (comm->bCartesianPP)
5664 if (cr->npmenodes == 0)
5666 /* The PP communicator is also
5667 * the communicator for this simulation
5669 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5671 cr->nodeid = dd->rank;
5673 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5675 /* We need to make an index to go from the coordinates
5676 * to the nodeid of this simulation.
5678 snew(comm->ddindex2simnodeid, dd->nnodes);
5679 snew(buf, dd->nnodes);
5680 if (cr->duty & DUTY_PP)
5682 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5684 /* Communicate the ddindex to simulation nodeid index */
5685 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5686 cr->mpi_comm_mysim);
5687 sfree(buf);
5689 /* Determine the master coordinates and rank.
5690 * The DD master should be the same node as the master of this sim.
5692 for (int i = 0; i < dd->nnodes; i++)
5694 if (comm->ddindex2simnodeid[i] == 0)
5696 ddindex2xyz(dd->nc, i, dd->master_ci);
5697 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5700 if (debug)
5702 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5705 else
5707 /* No Cartesian communicators */
5708         /* We use the rank in dd->mpi_comm_all as the DD index */
5709 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5710 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5711 dd->masterrank = 0;
5712 clear_ivec(dd->master_ci);
5714 #endif
5716 if (fplog)
5718 fprintf(fplog,
5719 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5720 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5722 if (debug)
5724 fprintf(debug,
5725 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5726 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5730 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5731 t_commrec *cr)
5733 #if GMX_MPI
5734 gmx_domdec_comm_t *comm = dd->comm;
5736 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5738 int *buf;
5739 snew(comm->ddindex2simnodeid, dd->nnodes);
5740 snew(buf, dd->nnodes);
5741 if (cr->duty & DUTY_PP)
5743 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5745 /* Communicate the ddindex to simulation nodeid index */
5746 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5747 cr->mpi_comm_mysim);
5748 sfree(buf);
5750 #else
5751 GMX_UNUSED_VALUE(dd);
5752 GMX_UNUSED_VALUE(cr);
5753 #endif
5756 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5757 int ncg, int natoms)
5759 gmx_domdec_master_t *ma;
5760 int i;
5762 snew(ma, 1);
5764 snew(ma->ncg, dd->nnodes);
5765 snew(ma->index, dd->nnodes+1);
5766 snew(ma->cg, ncg);
5767 snew(ma->nat, dd->nnodes);
5768 snew(ma->ibuf, dd->nnodes*2);
5769 snew(ma->cell_x, DIM);
5770 for (i = 0; i < DIM; i++)
5772 snew(ma->cell_x[i], dd->nc[i]+1);
5775 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5777 ma->vbuf = nullptr;
5779 else
5781 snew(ma->vbuf, natoms);
5784 return ma;
5787 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5788 int gmx_unused dd_rank_order,
5789 int gmx_unused reorder)
5791 gmx_domdec_comm_t *comm;
5792 int i;
5793 gmx_bool bDiv[DIM];
5794 #if GMX_MPI
5795 MPI_Comm comm_cart;
5796 #endif
5798 comm = dd->comm;
5800 if (comm->bCartesianPP)
5802 for (i = 1; i < DIM; i++)
5804 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5806 if (bDiv[YY] || bDiv[ZZ])
5808 comm->bCartesianPP_PME = TRUE;
5809 /* If we have 2D PME decomposition, which is always in x+y,
5810 * we stack the PME only nodes in z.
5811 * Otherwise we choose the direction that provides the thinnest slab
5812 * of PME only nodes as this will have the least effect
5813 * on the PP communication.
5814 * But for the PME communication the opposite might be better.
5816 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5817 !bDiv[YY] ||
5818 dd->nc[YY] > dd->nc[ZZ]))
5820 comm->cartpmedim = ZZ;
5822 else
5824 comm->cartpmedim = YY;
5826 comm->ntot[comm->cartpmedim]
5827 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5829 else if (fplog)
5831 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5832 fprintf(fplog,
5833 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5837 #if GMX_MPI
5838 if (comm->bCartesianPP_PME)
5840 int rank;
5841 ivec periods;
5843 if (fplog)
5845 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5848 for (i = 0; i < DIM; i++)
5850 periods[i] = TRUE;
5852 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5853 &comm_cart);
5854 MPI_Comm_rank(comm_cart, &rank);
5855 if (MASTER(cr) && rank != 0)
5857 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5860         /* With this assignment we lose the link to the original communicator,
5861          * which will usually be MPI_COMM_WORLD, unless we have multiple simulations.
5863 cr->mpi_comm_mysim = comm_cart;
5864 cr->sim_nodeid = rank;
5866 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5868 if (fplog)
5870 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5871 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5874 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5876 cr->duty = DUTY_PP;
5878 if (cr->npmenodes == 0 ||
5879 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5881 cr->duty = DUTY_PME;
5884 /* Split the sim communicator into PP and PME only nodes */
5885 MPI_Comm_split(cr->mpi_comm_mysim,
5886 cr->duty,
5887 dd_index(comm->ntot, dd->ci),
5888 &cr->mpi_comm_mygroup);
5890 else
5892 switch (dd_rank_order)
5894 case ddrankorderPP_PME:
5895 if (fplog)
5897 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5899 break;
5900 case ddrankorderINTERLEAVE:
5901 /* Interleave the PP-only and PME-only ranks */
5902 if (fplog)
5904 fprintf(fplog, "Interleaving PP and PME ranks\n");
5906 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5907 break;
5908 case ddrankorderCARTESIAN:
5909 break;
5910 default:
5911 gmx_fatal(FARGS, "Unknown dd_rank_order=%d", dd_rank_order);
5914 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5916 cr->duty = DUTY_PME;
5918 else
5920 cr->duty = DUTY_PP;
5923 /* Split the sim communicator into PP and PME only nodes */
5924 MPI_Comm_split(cr->mpi_comm_mysim,
5925 cr->duty,
5926 cr->nodeid,
5927 &cr->mpi_comm_mygroup);
5928 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5930 #endif
5932 if (fplog)
5934 fprintf(fplog, "This rank does only %s work.\n\n",
5935 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5939 /*! \brief Generates the MPI communicators for domain decomposition */
5940 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5941 gmx_domdec_t *dd, int dd_rank_order)
5943 gmx_domdec_comm_t *comm;
5944 int CartReorder;
5946 comm = dd->comm;
5948 copy_ivec(dd->nc, comm->ntot);
5950 comm->bCartesianPP = (dd_rank_order == ddrankorderCARTESIAN);
5951 comm->bCartesianPP_PME = FALSE;
5953 /* Reorder the nodes by default. This might change the MPI ranks.
5954      * Real reordering is only supported on very few architectures;
5955 * Blue Gene is one of them.
5957 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
5959 if (cr->npmenodes > 0)
5961 /* Split the communicator into a PP and PME part */
5962 split_communicator(fplog, cr, dd, dd_rank_order, CartReorder);
5963 if (comm->bCartesianPP_PME)
5965 /* We (possibly) reordered the nodes in split_communicator,
5966 * so it is no longer required in make_pp_communicator.
5968 CartReorder = FALSE;
5971 else
5973 /* All nodes do PP and PME */
5974 #if GMX_MPI
5975 /* We do not require separate communicators */
5976 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5977 #endif
5980 if (cr->duty & DUTY_PP)
5982 /* Copy or make a new PP communicator */
5983 make_pp_communicator(fplog, dd, cr, CartReorder);
5985 else
5987 receive_ddindex2simnodeid(dd, cr);
5990 if (!(cr->duty & DUTY_PME))
5992         /* Set up the communication to our PME node */
5993 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
5994 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
5995 if (debug)
5997 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
5998 dd->pme_nodeid, dd->pme_receive_vir_ener);
6001 else
6003 dd->pme_nodeid = -1;
6006 if (DDMASTER(dd))
6008 dd->ma = init_gmx_domdec_master_t(dd,
6009 comm->cgs_gl.nr,
6010 comm->cgs_gl.index[comm->cgs_gl.nr]);
6014 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6016 real *slb_frac, tot;
6017 int i, n;
6018 double dbl;
6020 slb_frac = nullptr;
6021 if (nc > 1 && size_string != nullptr)
6023 if (fplog)
6025 fprintf(fplog, "Using static load balancing for the %s direction\n",
6026 dir);
6028 snew(slb_frac, nc);
6029 tot = 0;
6030 for (i = 0; i < nc; i++)
6032 dbl = 0;
6033 sscanf(size_string, "%20lf%n", &dbl, &n);
6034 if (dbl == 0)
6036 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6038 slb_frac[i] = dbl;
6039 size_string += n;
6040 tot += slb_frac[i];
6042 /* Normalize */
6043 if (fplog)
6045 fprintf(fplog, "Relative cell sizes:");
6047 for (i = 0; i < nc; i++)
6049 slb_frac[i] /= tot;
6050 if (fplog)
6052 fprintf(fplog, " %5.3f", slb_frac[i]);
6055 if (fplog)
6057 fprintf(fplog, "\n");
6061 return slb_frac;
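     /* Usage example (illustrative, not from the original sources): a size
      * string of "1 2 1" with nc = 3 is normalized to the relative cell
      * sizes {0.25, 0.50, 0.25}, i.e. the middle cell along that direction
      * is twice as wide as the outer ones.
      */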
6064 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6066 int n, nmol, ftype;
6067 gmx_mtop_ilistloop_t iloop;
6068 t_ilist *il;
6070 n = 0;
6071 iloop = gmx_mtop_ilistloop_init(mtop);
6072 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6074 for (ftype = 0; ftype < F_NRE; ftype++)
6076 if ((interaction_function[ftype].flags & IF_BOND) &&
6077 NRAL(ftype) > 2)
6079 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6084 return n;
6087 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6089 char *val;
6090 int nst;
6092 nst = def;
6093 val = getenv(env_var);
6094 if (val)
6096 if (sscanf(val, "%20d", &nst) <= 0)
6098 nst = 1;
6100 if (fplog)
6102 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6103 env_var, val, nst);
6107 return nst;
6110 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6112 if (MASTER(cr))
6114 fprintf(stderr, "\n%s\n", warn_string);
6116 if (fplog)
6118 fprintf(fplog, "\n%s\n", warn_string);
6122 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6123 const t_inputrec *ir, FILE *fplog)
6125 if (ir->ePBC == epbcSCREW &&
6126 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6128 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6131 if (ir->ns_type == ensSIMPLE)
6133 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6136 if (ir->nstlist == 0)
6138 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6141 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6143 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6147 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6149 int di, d;
6150 real r;
6152 r = ddbox->box_size[XX];
6153 for (di = 0; di < dd->ndim; di++)
6155 d = dd->dim[di];
6156 /* Check using the initial average cell size */
6157 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6160 return r;
6163 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
6165 static int forceDlbOffOrBail(int cmdlineDlbState,
6166 const std::string &reasonStr,
6167 t_commrec *cr,
6168 FILE *fplog)
6170 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
6171 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
6173 if (cmdlineDlbState == edlbsOnUser)
6175 gmx_fatal(FARGS, (dlbNotSupportedErr + reasonStr).c_str());
6177 else if (cmdlineDlbState == edlbsOffCanTurnOn)
6179 dd_warning(cr, fplog, (dlbDisableNote + reasonStr + "\n").c_str());
6181 return edlbsOffForever;
6184 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
6186 * This function parses the parameters of "-dlb" command line option setting
6187 * corresponding state values. Then it checks the consistency of the determined
6188 * state with other run parameters and settings. As a result, the initial state
6189 * may be altered or an error may be thrown if incompatibility of options is detected.
6191 * \param [in] fplog Pointer to mdrun log file.
6192 * \param [in] cr Pointer to MPI communication object.
6193 * \param [in] dlb_opt Pointer to the '-dlb' command line argument's option.
6194 * \param [in] bRecordLoad True if the load balancer is recording load information.
6195 * \param [in] Flags Simulation flags passed from main.
6196 * \param [in] ir Pointer mdrun to input parameters.
6197 * \returns DLB initial/startup state.
6199 static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
6200 const char *dlb_opt, gmx_bool bRecordLoad,
6201 unsigned long Flags, const t_inputrec *ir)
6203 int dlbState = -1;
6205 switch (dlb_opt[0])
6207 case 'a': dlbState = edlbsOffCanTurnOn; break;
6208 case 'n': dlbState = edlbsOffUser; break;
6209 case 'y': dlbState = edlbsOnUser; break;
6210 default: gmx_incons("Unknown dlb_opt");
6213 /* Reruns don't support DLB: bail or override auto mode */
6214 if (Flags & MD_RERUN)
6216 std::string reasonStr = "it is not supported in reruns.";
6217 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6220 /* Unsupported integrators */
6221 if (!EI_DYNAMICS(ir->eI))
6223 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
6224 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6227 /* Without cycle counters we can't time work to balance on */
6228 if (!bRecordLoad)
6230 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
6231 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6234 if (Flags & MD_REPRODUCIBLE)
6236 std::string reasonStr = "you started a reproducible run.";
6237 switch (dlbState)
6239 case edlbsOffUser:
6240 break;
6241 case edlbsOffForever:
6242 GMX_RELEASE_ASSERT(false, "edlbsOffForever is not a valid initial state");
6243 break;
6244 case edlbsOffCanTurnOn:
6245 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6246 break;
6247 case edlbsOnCanTurnOff:
6248                 GMX_RELEASE_ASSERT(false, "edlbsOnCanTurnOff is not a valid initial state");
6249 break;
6250 case edlbsOnUser:
6251 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", cr, fplog);
6252 break;
6253 default:
6254 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6255 break;
6259 return dlbState;
6262 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6264 int dim;
6266 dd->ndim = 0;
6267 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6269 /* Decomposition order z,y,x */
6270 if (fplog)
6272 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6274 for (dim = DIM-1; dim >= 0; dim--)
6276 if (dd->nc[dim] > 1)
6278 dd->dim[dd->ndim++] = dim;
6282 else
6284 /* Decomposition order x,y,z */
6285 for (dim = 0; dim < DIM; dim++)
6287 if (dd->nc[dim] > 1)
6289 dd->dim[dd->ndim++] = dim;
6295 static gmx_domdec_comm_t *init_dd_comm()
6297 gmx_domdec_comm_t *comm;
6298 int i;
6300 snew(comm, 1);
6301 snew(comm->cggl_flag, DIM*2);
6302 snew(comm->cgcm_state, DIM*2);
6303 for (i = 0; i < DIM*2; i++)
6305 comm->cggl_flag_nalloc[i] = 0;
6306 comm->cgcm_state_nalloc[i] = 0;
6309 comm->nalloc_int = 0;
6310 comm->buf_int = nullptr;
6312 vec_rvec_init(&comm->vbuf);
6314 comm->n_load_have = 0;
6315 comm->n_load_collect = 0;
6317 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6319 comm->sum_nat[i] = 0;
6321 comm->ndecomp = 0;
6322 comm->nload = 0;
6323 comm->load_step = 0;
6324 comm->load_sum = 0;
6325 comm->load_max = 0;
6326 clear_ivec(comm->load_lim);
6327 comm->load_mdf = 0;
6328 comm->load_pme = 0;
6330 /* This should be replaced by a unique pointer */
6331 comm->balanceRegion = ddBalanceRegionAllocate();
6333 return comm;
6336 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6337 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6338 unsigned long Flags,
6339 ivec nc, int nPmeRanks,
6340 real comm_distance_min, real rconstr,
6341 const char *dlb_opt, real dlb_scale,
6342 const char *sizex, const char *sizey, const char *sizez,
6343 const gmx_mtop_t *mtop,
6344 const t_inputrec *ir,
6345 matrix box, const rvec *x,
6346 gmx_ddbox_t *ddbox,
6347 int *npme_x, int *npme_y)
6349 real r_bonded = -1;
6350 real r_bonded_limit = -1;
6351 const real tenPercentMargin = 1.1;
6352 gmx_domdec_comm_t *comm = dd->comm;
6354 snew(comm->cggl_flag, DIM*2);
6355 snew(comm->cgcm_state, DIM*2);
6357 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6358 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6360 dd->pme_recv_f_alloc = 0;
6361 dd->pme_recv_f_buf = nullptr;
6363     /* Initialize the GPU share count to 0; it might change later */
6364 comm->nrank_gpu_shared = 0;
6366 comm->dlbState = determineInitialDlbState(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6367 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6368 /* To consider turning DLB on after 2*nstlist steps we need to check
6369 * at partitioning count 3. Thus we need to increase the first count by 2.
6371 comm->ddPartioningCountFirstDlbOff += 2;
6373 if (fplog)
6375 fprintf(fplog, "Dynamic load balancing: %s\n",
6376 edlbs_names[comm->dlbState]);
6378 comm->bPMELoadBalDLBLimits = FALSE;
6380 /* Allocate the charge group/atom sorting struct */
6381 snew(comm->sort, 1);
6383 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6385 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6386 mtop->bIntermolecularInteractions);
6387 if (comm->bInterCGBondeds)
6389 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6391 else
6393 comm->bInterCGMultiBody = FALSE;
6396 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6397 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6399 if (ir->rlist == 0)
6401 /* Set the cut-off to some very large value,
6402 * so we don't need if statements everywhere in the code.
6403 * We use sqrt, since the cut-off is squared in some places.
6405 comm->cutoff = GMX_CUTOFF_INF;
6407 else
6409 comm->cutoff = ir->rlist;
6411 comm->cutoff_mbody = 0;
6413 comm->cellsize_limit = 0;
6414 comm->bBondComm = FALSE;
6416 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6417 * within nstlist steps. Since boundaries are allowed to displace by half
6418 * a cell size, DD cells should be at least the size of the list buffer.
6420 comm->cellsize_limit = std::max(comm->cellsize_limit,
6421 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6423 if (comm->bInterCGBondeds)
6425 if (comm_distance_min > 0)
6427 comm->cutoff_mbody = comm_distance_min;
6428 if (Flags & MD_DDBONDCOMM)
6430 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6432 else
6434 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6436 r_bonded_limit = comm->cutoff_mbody;
6438 else if (ir->bPeriodicMols)
6440         /* Cannot easily determine the required cut-off */
6441 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6442 comm->cutoff_mbody = comm->cutoff/2;
6443 r_bonded_limit = comm->cutoff_mbody;
6445 else
6447 real r_2b, r_mb;
6449 if (MASTER(cr))
6451 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6452 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6454 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6455 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6457 /* We use an initial margin of 10% for the minimum cell size,
6458 * except when we are just below the non-bonded cut-off.
6460 if (Flags & MD_DDBONDCOMM)
6462 if (std::max(r_2b, r_mb) > comm->cutoff)
6464 r_bonded = std::max(r_2b, r_mb);
6465 r_bonded_limit = tenPercentMargin*r_bonded;
6466 comm->bBondComm = TRUE;
6468 else
6470 r_bonded = r_mb;
6471 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6473 /* We determine cutoff_mbody later */
6475 else
6477 /* No special bonded communication,
6478 * simply increase the DD cut-off.
6480 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6481 comm->cutoff_mbody = r_bonded_limit;
6482 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6485 if (fplog)
6487 fprintf(fplog,
6488 "Minimum cell size due to bonded interactions: %.3f nm\n",
6489 r_bonded_limit);
6491 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6494 if (dd->bInterCGcons && rconstr <= 0)
6496 /* There is a cell size limit due to the constraints (P-LINCS) */
6497 rconstr = constr_r_max(fplog, mtop, ir);
6498 if (fplog)
6500 fprintf(fplog,
6501 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6502 rconstr);
6503 if (rconstr > comm->cellsize_limit)
6505 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6509 else if (rconstr > 0 && fplog)
6511 /* Here we do not check for dd->bInterCGcons,
6512 * because one can also set a cell size limit for virtual sites only
6513 * and at this point we don't know yet if there are intercg v-sites.
6515 fprintf(fplog,
6516 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6517 rconstr);
6519 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6521 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6523 if (nc[XX] > 0)
6525 copy_ivec(nc, dd->nc);
6526 set_dd_dim(fplog, dd);
6527 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6529 if (nPmeRanks >= 0)
6531 cr->npmenodes = nPmeRanks;
6533 else
6535 /* When the DD grid is set explicitly and -npme is set to auto,
6536 * don't use PME ranks. We check later if the DD grid is
6537 * compatible with the total number of ranks.
6539 cr->npmenodes = 0;
6542 real acs = average_cellsize_min(dd, ddbox);
6543 if (acs < comm->cellsize_limit)
6545 if (fplog)
6547 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6549 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6550 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6551 acs, comm->cellsize_limit);
6554 else
6556 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, x, ddbox);
6558 /* We need to choose the optimal DD grid and possibly PME nodes */
6559 real limit =
6560 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6561 nPmeRanks,
6562 !isDlbDisabled(comm),
6563 dlb_scale,
6564 comm->cellsize_limit, comm->cutoff,
6565 comm->bInterCGBondeds);
6567 if (dd->nc[XX] == 0)
6569 char buf[STRLEN];
6570 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6571 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6572 !bC ? "-rdd" : "-rcon",
6573 comm->dlbState != edlbsOffUser ? " or -dds" : "",
6574 bC ? " or your LINCS settings" : "");
6576 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6577 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6578 "%s\n"
6579 "Look in the log file for details on the domain decomposition",
6580 cr->nnodes-cr->npmenodes, limit, buf);
6582 set_dd_dim(fplog, dd);
6585 if (fplog)
6587 fprintf(fplog,
6588 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6589 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6592 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6593 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6595 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6596 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6597 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6599 if (cr->npmenodes > dd->nnodes)
6601 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6602 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6604 if (cr->npmenodes > 0)
6606 comm->npmenodes = cr->npmenodes;
6608 else
6610 comm->npmenodes = dd->nnodes;
6613 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6615 /* The following choices should match those
6616 * in comm_cost_est in domdec_setup.c.
6617 * Note that here the checks have to take into account
6618 * that the decomposition might occur in a different order than xyz
6619 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6620 * in which case they will not match those in comm_cost_est,
6621 * but since that is mainly for testing purposes that's fine.
6623 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6624 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6625 getenv("GMX_PMEONEDD") == nullptr)
6627 comm->npmedecompdim = 2;
6628 comm->npmenodes_x = dd->nc[XX];
6629 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6631 else
6633 /* In case nc is 1 in both x and y we could still choose to
6634 * decompose pme in y instead of x, but we use x for simplicity.
6636 comm->npmedecompdim = 1;
6637 if (dd->dim[0] == YY)
6639 comm->npmenodes_x = 1;
6640 comm->npmenodes_y = comm->npmenodes;
6642 else
6644 comm->npmenodes_x = comm->npmenodes;
6645 comm->npmenodes_y = 1;
6648 if (fplog)
6650 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6651 comm->npmenodes_x, comm->npmenodes_y, 1);
6654 else
6656 comm->npmedecompdim = 0;
6657 comm->npmenodes_x = 0;
6658 comm->npmenodes_y = 0;
6661 /* Technically we don't need both of these,
6662      * but it simplifies the code by not having to recalculate it.
6664 *npme_x = comm->npmenodes_x;
6665 *npme_y = comm->npmenodes_y;
6667 snew(comm->slb_frac, DIM);
6668 if (isDlbDisabled(comm))
6670 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6671 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6672 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6675 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6677 if (comm->bBondComm || !isDlbDisabled(comm))
6679             /* Set the bonded communication distance to halfway between
6680 * the minimum and the maximum,
6681 * since the extra communication cost is nearly zero.
6683 real acs = average_cellsize_min(dd, ddbox);
6684 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6685 if (!isDlbDisabled(comm))
6687 /* Check if this does not limit the scaling */
6688 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6690 if (!comm->bBondComm)
6692 /* Without bBondComm do not go beyond the n.b. cut-off */
6693 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6694 if (comm->cellsize_limit >= comm->cutoff)
6696                     /* We don't lose a lot of efficiency
6697                      * when increasing it to the n.b. cut-off.
6698                      * It can even be slightly faster, because we need
6699                      * fewer checks for the communication setup.
6701 comm->cutoff_mbody = comm->cutoff;
6704 /* Check if we did not end up below our original limit */
6705 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6707 if (comm->cutoff_mbody > comm->cellsize_limit)
6709 comm->cellsize_limit = comm->cutoff_mbody;
6712 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6715 if (debug)
6717 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6718 "cellsize limit %f\n",
6719 comm->bBondComm, comm->cellsize_limit);
6722 if (MASTER(cr))
6724 check_dd_restrictions(cr, dd, ir, fplog);
6728 static void set_dlb_limits(gmx_domdec_t *dd)
6731 int d;
6733 for (d = 0; d < dd->ndim; d++)
6735 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6736 dd->comm->cellsize_min[dd->dim[d]] =
6737 dd->comm->cellsize_min_dlb[dd->dim[d]];
6742 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6744 gmx_domdec_t *dd;
6745 gmx_domdec_comm_t *comm;
6746 real cellsize_min;
6747 int d, nc, i;
6749 dd = cr->dd;
6750 comm = dd->comm;
6752 cellsize_min = comm->cellsize_min[dd->dim[0]];
6753 for (d = 1; d < dd->ndim; d++)
6755 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6758 /* Turn off DLB if we're too close to the cell size limit. */
6759 if (cellsize_min < comm->cellsize_limit*1.05)
6761 auto str = gmx::formatString("step %" GMX_PRId64 " Measured %.1f %% performance loss due to load imbalance, "
6762 "but the minimum cell size is smaller than 1.05 times the cell size limit."
6763 "Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6764 dd_warning(cr, fplog, str.c_str());
6766 comm->dlbState = edlbsOffForever;
6767 return;
6770 char buf[STRLEN];
6771 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6772 dd_warning(cr, fplog, buf);
6773 comm->dlbState = edlbsOnCanTurnOff;
6775 /* Store the non-DLB performance, so we can check if DLB actually
6776 * improves performance.
6778 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6779 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6781 set_dlb_limits(dd);
6783 /* We can set the required cell size info here,
6784 * so we do not need to communicate this.
6785 * The grid is completely uniform.
6787 for (d = 0; d < dd->ndim; d++)
6789 if (comm->root[d])
6791 comm->load[d].sum_m = comm->load[d].sum;
6793 nc = dd->nc[dd->dim[d]];
6794 for (i = 0; i < nc; i++)
6796 comm->root[d]->cell_f[i] = i/(real)nc;
6797 if (d > 0)
6799 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6800 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6803 comm->root[d]->cell_f[nc] = 1.0;
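             /* Illustrative example: for nc = 4 the uniform boundaries set
              * above are cell_f = {0, 0.25, 0.5, 0.75, 1.0}, so all cells
              * start with equal fractional size before DLB adjusts them.
              */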
6808 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6810 gmx_domdec_t *dd = cr->dd;
6812 char buf[STRLEN];
6813 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6814 dd_warning(cr, fplog, buf);
6815 dd->comm->dlbState = edlbsOffCanTurnOn;
6816 dd->comm->haveTurnedOffDlb = true;
6817 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6820 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6822 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6823 char buf[STRLEN];
6824 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6825 dd_warning(cr, fplog, buf);
6826 cr->dd->comm->dlbState = edlbsOffForever;
6829 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6831 int ncg, cg;
6832 char *bLocalCG;
6834 ncg = ncg_mtop(mtop);
6835 snew(bLocalCG, ncg);
6836 for (cg = 0; cg < ncg; cg++)
6838 bLocalCG[cg] = FALSE;
6841 return bLocalCG;
6844 void dd_init_bondeds(FILE *fplog,
6845 gmx_domdec_t *dd,
6846 const gmx_mtop_t *mtop,
6847 const gmx_vsite_t *vsite,
6848 const t_inputrec *ir,
6849 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6851 gmx_domdec_comm_t *comm;
6853 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6855 comm = dd->comm;
6857 if (comm->bBondComm)
6859 /* Communicate atoms beyond the cut-off for bonded interactions */
6860 comm = dd->comm;
6862 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6864 comm->bLocalCG = init_bLocalCG(mtop);
6866 else
6868 /* Only communicate atoms based on cut-off */
6869 comm->cglink = nullptr;
6870 comm->bLocalCG = nullptr;
6874 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6875 const gmx_mtop_t *mtop, const t_inputrec *ir,
6876 gmx_bool bDynLoadBal, real dlb_scale,
6877 const gmx_ddbox_t *ddbox)
6879 gmx_domdec_comm_t *comm;
6880 int d;
6881 ivec np;
6882 real limit, shrink;
6883 char buf[64];
6885 if (fplog == nullptr)
6887 return;
6890 comm = dd->comm;
6892 if (bDynLoadBal)
6894 fprintf(fplog, "The maximum number of communication pulses is:");
6895 for (d = 0; d < dd->ndim; d++)
6897 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6899 fprintf(fplog, "\n");
6900 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6901 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6902 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6903 for (d = 0; d < DIM; d++)
6905 if (dd->nc[d] > 1)
6907 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6909 shrink = 0;
6911 else
6913 shrink =
6914 comm->cellsize_min_dlb[d]/
6915 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6917 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6920 fprintf(fplog, "\n");
6922 else
6924 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6925 fprintf(fplog, "The initial number of communication pulses is:");
6926 for (d = 0; d < dd->ndim; d++)
6928 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6930 fprintf(fplog, "\n");
6931 fprintf(fplog, "The initial domain decomposition cell size is:");
6932 for (d = 0; d < DIM; d++)
6934 if (dd->nc[d] > 1)
6936 fprintf(fplog, " %c %.2f nm",
6937 dim2char(d), dd->comm->cellsize_min[d]);
6940 fprintf(fplog, "\n\n");
6943 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6945 if (comm->bInterCGBondeds ||
6946 bInterCGVsites ||
6947 dd->bInterCGcons || dd->bInterCGsettles)
6949 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6950 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6951 "non-bonded interactions", "", comm->cutoff);
6953 if (bDynLoadBal)
6955 limit = dd->comm->cellsize_limit;
6957 else
6959 if (dynamic_dd_box(ddbox, ir))
6961 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6963 limit = dd->comm->cellsize_min[XX];
6964 for (d = 1; d < DIM; d++)
6966 limit = std::min(limit, dd->comm->cellsize_min[d]);
6970 if (comm->bInterCGBondeds)
6972 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6973 "two-body bonded interactions", "(-rdd)",
6974 std::max(comm->cutoff, comm->cutoff_mbody));
6975 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6976 "multi-body bonded interactions", "(-rdd)",
6977 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6979 if (bInterCGVsites)
6981 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6982 "virtual site constructions", "(-rcon)", limit);
6984 if (dd->bInterCGcons || dd->bInterCGsettles)
6986 sprintf(buf, "atoms separated by up to %d constraints",
6987 1+ir->nProjOrder);
6988 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6989 buf, "(-rcon)", limit);
6991 fprintf(fplog, "\n");
6994 fflush(fplog);
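/*! \brief Sets the maximum number of communication pulses and the minimum cell sizes used with dynamic load balancing */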
6997 static void set_cell_limits_dlb(gmx_domdec_t *dd,
6998 real dlb_scale,
6999 const t_inputrec *ir,
7000 const gmx_ddbox_t *ddbox)
7002 gmx_domdec_comm_t *comm;
7003 int d, dim, npulse, npulse_d_max, npulse_d;
7004 gmx_bool bNoCutOff;
7006 comm = dd->comm;
7008 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7010 /* Determine the maximum number of comm. pulses in one dimension */
7012 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7014 /* Determine the maximum required number of grid pulses */
7015 if (comm->cellsize_limit >= comm->cutoff)
7017 /* Only a single pulse is required */
7018 npulse = 1;
7020 else if (!bNoCutOff && comm->cellsize_limit > 0)
7022 /* We round down slightly here to avoid overhead due to the latency
7023 * of extra communication calls when the cut-off
7024 * would be only slightly longer than the cell size.
7025 * Later cellsize_limit is redetermined,
7026 * so we can not miss interactions due to this rounding.
7028 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7030 else
7032 /* There is no cell size limit */
7033 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7036 if (!bNoCutOff && npulse > 1)
7038 /* See if we can do with less pulses, based on dlb_scale */
7039 npulse_d_max = 0;
7040 for (d = 0; d < dd->ndim; d++)
7042 dim = dd->dim[d];
7043 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7044 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7045 npulse_d_max = std::max(npulse_d_max, npulse_d);
7047 npulse = std::min(npulse, npulse_d_max);
7050 /* This env var can override npulse */
7051 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7052 if (d > 0)
7054 npulse = d;
7057 comm->maxpulse = 1;
7058 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7059 for (d = 0; d < dd->ndim; d++)
7061 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7062 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7063 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7064 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7065 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7067 comm->bVacDLBNoLimit = FALSE;
7071 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7072 if (!comm->bVacDLBNoLimit)
7074 comm->cellsize_limit = std::max(comm->cellsize_limit,
7075 comm->cutoff/comm->maxpulse);
7077 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7078 /* Set the minimum cell size for each DD dimension */
7079 for (d = 0; d < dd->ndim; d++)
7081 if (comm->bVacDLBNoLimit ||
7082 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7084 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7086 else
7088 comm->cellsize_min_dlb[dd->dim[d]] =
7089 comm->cutoff/comm->cd[d].np_dlb;
7092 if (comm->cutoff_mbody <= 0)
7094 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7096 if (isDlbOn(comm))
7098 set_dlb_limits(dd);
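/* Returns whether PBC must be taken into account for the bonded interactions with this domain decomposition */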
7102 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7104 /* If each molecule is a single charge group
7105 * or we use domain decomposition for each periodic dimension,
7106 * we do not need to take pbc into account for the bonded interactions.
7108 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7109 !(dd->nc[XX] > 1 &&
7110 dd->nc[YY] > 1 &&
7111 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7114 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7115 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7116 const gmx_mtop_t *mtop, const t_inputrec *ir,
7117 const gmx_ddbox_t *ddbox)
7119 gmx_domdec_comm_t *comm;
7120 int natoms_tot;
7121 real vol_frac;
7123 comm = dd->comm;
7125 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7127 init_ddpme(dd, &comm->ddpme[0], 0);
7128 if (comm->npmedecompdim >= 2)
7130 init_ddpme(dd, &comm->ddpme[1], 1);
7133 else
7135 comm->npmenodes = 0;
7136 if (dd->pme_nodeid >= 0)
7138 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7139 "Can not have separate PME ranks without PME electrostatics");
7143 if (debug)
7145 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7147 if (!isDlbDisabled(comm))
7149 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7152 print_dd_settings(fplog, dd, mtop, ir, isDlbOn(comm), dlb_scale, ddbox);
7153 if (comm->dlbState == edlbsOffCanTurnOn)
7155 if (fplog)
7157 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7159 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7162 if (ir->ePBC == epbcNONE)
7164 vol_frac = 1 - 1/(double)dd->nnodes;
7166 else
7168 vol_frac =
7169 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7171 if (debug)
7173 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7175 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7177 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7180 /*! \brief Set some important DD parameters that can be modified by env.vars */
7181 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7183 gmx_domdec_comm_t *comm = dd->comm;
7185 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7186 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7187 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7188 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7189 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7190 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7191 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7193 if (dd->bSendRecv2 && fplog)
7195 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7198 if (comm->eFlop)
7200 if (fplog)
7202 fprintf(fplog, "Will load balance based on FLOP count\n");
7204 if (comm->eFlop > 1)
7206 srand(1 + rank_mysim);
7208 comm->bRecordLoad = TRUE;
7210 else
7212 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
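/* Allocates and sets up the domain decomposition: applies the environment
 * overrides, determines the grid and limits, creates the DD communicators
 * and, on PP ranks, the grid parameters and neighbor relations. */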
7216 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7217 unsigned long Flags,
7218 ivec nc, int nPmeRanks,
7219 int dd_rank_order,
7220 real comm_distance_min, real rconstr,
7221 const char *dlb_opt, real dlb_scale,
7222 const char *sizex, const char *sizey, const char *sizez,
7223 const gmx_mtop_t *mtop,
7224 const t_inputrec *ir,
7225 matrix box, rvec *x,
7226 gmx_ddbox_t *ddbox,
7227 int *npme_x, int *npme_y)
7229 gmx_domdec_t *dd;
7231 if (fplog)
7233 fprintf(fplog,
7234 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7237 snew(dd, 1);
7239 dd->comm = init_dd_comm();
7241 set_dd_envvar_options(fplog, dd, cr->nodeid);
7243 set_dd_limits_and_grid(fplog, cr, dd, Flags,
7244 nc, nPmeRanks,
7245 comm_distance_min, rconstr,
7246 dlb_opt, dlb_scale,
7247 sizex, sizey, sizez,
7248 mtop, ir,
7249 box, x,
7250 ddbox,
7251 npme_x, npme_y);
7253 make_dd_communicators(fplog, cr, dd, dd_rank_order);
7255 if (cr->duty & DUTY_PP)
7257 set_ddgrid_parameters(fplog, dd, dlb_scale, mtop, ir, ddbox);
7259 setup_neighbor_relations(dd);
7262 /* Set overallocation to avoid frequent reallocation of arrays */
7263 set_over_alloc_dd(TRUE);
7266 /* Initialize DD partitioning counters */
7266 dd->comm->partition_step = INT_MIN;
7267 dd->ddp_count = 0;
7269 /* We don't know the number of threads yet; this is set later */
7270 dd->comm->nth = 0;
7272 clear_dd_cycle_counts(dd);
7274 return dd;
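/*! \brief Returns whether the current DD grid and cell sizes can accommodate the requested cut-off on all ranks */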
7277 static gmx_bool test_dd_cutoff(t_commrec *cr,
7278 t_state *state, const t_inputrec *ir,
7279 real cutoff_req)
7281 gmx_domdec_t *dd;
7282 gmx_ddbox_t ddbox;
7283 int d, dim, np;
7284 real inv_cell_size;
7285 int LocallyLimited;
7287 dd = cr->dd;
7289 set_ddbox(dd, FALSE, cr, ir, state->box,
7290 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7292 LocallyLimited = 0;
7294 for (d = 0; d < dd->ndim; d++)
7296 dim = dd->dim[d];
7298 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7299 if (dynamic_dd_box(&ddbox, ir))
7301 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7304 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7306 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
7308 if (np > dd->comm->cd[d].np_dlb)
7310 return FALSE;
7313 /* If a current local cell size is smaller than the requested
7314 * cut-off, we could still fix it, but this gets very complicated.
7315 * Without fixing here, we might actually need more checks.
7317 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7319 LocallyLimited = 1;
7324 if (!isDlbDisabled(dd->comm))
7326 /* If DLB is not active yet, we don't need to check the grid jumps.
7327 * Actually we shouldn't, because then the grid jump data is not set.
7329 if (isDlbOn(dd->comm) &&
7330 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7332 LocallyLimited = 1;
7335 gmx_sumi(1, &LocallyLimited, cr);
7337 if (LocallyLimited > 0)
7339 return FALSE;
7343 return TRUE;
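/* Sets the DD communication cut-off to cutoff_req when the current decomposition allows it; returns whether the change was made */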
7346 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7347 real cutoff_req)
7349 gmx_bool bCutoffAllowed;
7351 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7353 if (bCutoffAllowed)
7355 cr->dd->comm->cutoff = cutoff_req;
7358 return bCutoffAllowed;
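/* Turns on the DLB staggering limit and stores the largest cut-off that PME load balancing may still request */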
7361 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7363 gmx_domdec_comm_t *comm;
7365 comm = cr->dd->comm;
7367 /* Turn on the DLB limiting (might have been on already) */
7368 comm->bPMELoadBalDLBLimits = TRUE;
7370 /* Change the cut-off limit */
7371 comm->PMELoadBal_max_cutoff = cutoff;
7373 if (debug)
7375 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7376 comm->PMELoadBal_max_cutoff);
7380 /* Sets whether we should later check the load imbalance data, so that
7381 * we can trigger dynamic load balancing if enough imbalance has
7382 * arisen.
7384 * Used after PME load balancing unlocks DLB, so that the check
7385 * whether DLB will be useful can happen immediately.
7387 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7389 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7391 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7393 if (bValue == TRUE)
7395 /* Store the DD partitioning count, so we can ignore cycle counts
7396 * over the next nstlist steps, which are often slower.
7398 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7403 /* Returns whether we should check if there has been enough load
7404 * imbalance to trigger dynamic load balancing.
7406 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7408 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7410 return FALSE;
7413 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7415 /* We ignore the first nstlist steps at the start of the run
7416 * or after PME load balancing or after turning DLB off, since
7417 * these often have extra allocation or cache miss overhead.
7419 return FALSE;
7422 /* We should check whether we should use DLB directly after
7423 * unlocking DLB. */
7424 if (dd->comm->bCheckWhetherToTurnDlbOn)
7426 /* This flag was set when the PME load-balancing routines
7427 unlocked DLB, and should now be cleared. */
7428 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7429 return TRUE;
7431 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7432 * partitionings (we do not do this every partitioning, so that we
7433 * avoid excessive communication). */
7434 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7436 return TRUE;
7439 return FALSE;
7442 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7444 return isDlbOn(dd->comm);
7447 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7449 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7452 void dd_dlb_lock(gmx_domdec_t *dd)
7454 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7455 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7457 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7461 void dd_dlb_unlock(gmx_domdec_t *dd)
7463 /* We can only unlock DLB when it was temporarily locked, otherwise don't do anything */
7464 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7466 dd->comm->dlbState = edlbsOffCanTurnOn;
7467 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
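/*! \brief Merges the charge-group data received in one pulse into the local arrays, shifting data from earlier pulses to make room */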
7471 static void merge_cg_buffers(int ncell,
7472 gmx_domdec_comm_dim_t *cd, int pulse,
7473 int *ncg_cell,
7474 int *index_gl, int *recv_i,
7475 rvec *cg_cm, rvec *recv_vr,
7476 int *cgindex,
7477 cginfo_mb_t *cginfo_mb, int *cginfo)
7479 gmx_domdec_ind_t *ind, *ind_p;
7480 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7481 int shift, shift_at;
7483 ind = &cd->ind[pulse];
7485 /* First correct the already stored data */
7486 shift = ind->nrecv[ncell];
7487 for (cell = ncell-1; cell >= 0; cell--)
7489 shift -= ind->nrecv[cell];
7490 if (shift > 0)
7492 /* Move the cg's present from previous grid pulses */
7493 cg0 = ncg_cell[ncell+cell];
7494 cg1 = ncg_cell[ncell+cell+1];
7495 cgindex[cg1+shift] = cgindex[cg1];
7496 for (cg = cg1-1; cg >= cg0; cg--)
7498 index_gl[cg+shift] = index_gl[cg];
7499 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7500 cgindex[cg+shift] = cgindex[cg];
7501 cginfo[cg+shift] = cginfo[cg];
7503 /* Correct the already stored send indices for the shift */
7504 for (p = 1; p <= pulse; p++)
7506 ind_p = &cd->ind[p];
7507 cg0 = 0;
7508 for (c = 0; c < cell; c++)
7510 cg0 += ind_p->nsend[c];
7512 cg1 = cg0 + ind_p->nsend[cell];
7513 for (cg = cg0; cg < cg1; cg++)
7515 ind_p->index[cg] += shift;
7521 /* Merge in the communicated buffers */
7522 shift = 0;
7523 shift_at = 0;
7524 cg0 = 0;
7525 for (cell = 0; cell < ncell; cell++)
7527 cg1 = ncg_cell[ncell+cell+1] + shift;
7528 if (shift_at > 0)
7530 /* Correct the old cg indices */
7531 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7533 cgindex[cg+1] += shift_at;
7536 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7538 /* Copy this charge group from the buffer */
7539 index_gl[cg1] = recv_i[cg0];
7540 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7541 /* Add it to the cgindex */
7542 cg_gl = index_gl[cg1];
7543 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7544 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7545 cgindex[cg1+1] = cgindex[cg1] + nat;
7546 cg0++;
7547 cg1++;
7548 shift_at += nat;
7550 shift += ind->nrecv[cell];
7551 ncg_cell[ncell+cell+1] = cg1;
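/*! \brief Stores, per zone and pulse, the atom block boundaries used for copying the communication buffers */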
7555 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7556 int nzone, int cg0, const int *cgindex)
7558 int cg, zone, p;
7560 /* Store the atom block boundaries for easy copying of communication buffers
7562 cg = cg0;
7563 for (zone = 0; zone < nzone; zone++)
7565 for (p = 0; p < cd->np; p++)
7567 cd->ind[p].cell2at0[zone] = cgindex[cg];
7568 cg += cd->ind[p].nrecv[zone];
7569 cd->ind[p].cell2at1[zone] = cgindex[cg];
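/*! \brief Returns whether charge group cg_gl is linked to a charge group that is not marked as local */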
7574 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7576 int i;
7577 gmx_bool bMiss;
7579 bMiss = FALSE;
7580 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7582 if (!bLocalCG[link->a[i]])
7584 bMiss = TRUE;
7588 return bMiss;
7591 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7592 typedef struct {
7593 real c[DIM][4]; /* the corners for the non-bonded communication */
7594 real cr0; /* corner for rounding */
7595 real cr1[4]; /* corners for rounding */
7596 real bc[DIM]; /* corners for bounded communication */
7597 real bcr1; /* corner for rounding for bonded communication */
7598 } dd_corners_t;
7600 /* Determine the corners of the domain(s) we are communicating with */
7601 static void
7602 set_dd_corners(const gmx_domdec_t *dd,
7603 int dim0, int dim1, int dim2,
7604 gmx_bool bDistMB,
7605 dd_corners_t *c)
7607 const gmx_domdec_comm_t *comm;
7608 const gmx_domdec_zones_t *zones;
7609 int i, j;
7611 comm = dd->comm;
7613 zones = &comm->zones;
7615 /* Keep the compiler happy */
7616 c->cr0 = 0;
7617 c->bcr1 = 0;
7619 /* The first dimension is equal for all cells */
7620 c->c[0][0] = comm->cell_x0[dim0];
7621 if (bDistMB)
7623 c->bc[0] = c->c[0][0];
7625 if (dd->ndim >= 2)
7627 dim1 = dd->dim[1];
7628 /* This cell row is only seen from the first row */
7629 c->c[1][0] = comm->cell_x0[dim1];
7630 /* All rows can see this row */
7631 c->c[1][1] = comm->cell_x0[dim1];
7632 if (isDlbOn(dd->comm))
7634 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7635 if (bDistMB)
7637 /* For the multi-body distance we need the maximum */
7638 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7641 /* Set the upper-right corner for rounding */
7642 c->cr0 = comm->cell_x1[dim0];
7644 if (dd->ndim >= 3)
7646 dim2 = dd->dim[2];
7647 for (j = 0; j < 4; j++)
7649 c->c[2][j] = comm->cell_x0[dim2];
7651 if (isDlbOn(dd->comm))
7653 /* Use the maximum of the i-cells that see a j-cell */
7654 for (i = 0; i < zones->nizone; i++)
7656 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7658 if (j >= 4)
7660 c->c[2][j-4] =
7661 std::max(c->c[2][j-4],
7662 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7666 if (bDistMB)
7668 /* For the multi-body distance we need the maximum */
7669 c->bc[2] = comm->cell_x0[dim2];
7670 for (i = 0; i < 2; i++)
7672 for (j = 0; j < 2; j++)
7674 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7680 /* Set the upper-right corner for rounding */
7681 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7682 * Only cell (0,0,0) can see cell 7 (1,1,1)
7684 c->cr1[0] = comm->cell_x1[dim1];
7685 c->cr1[3] = comm->cell_x1[dim1];
7686 if (isDlbOn(dd->comm))
7688 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7689 if (bDistMB)
7691 /* For the multi-body distance we need the maximum */
7692 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7699 /* Determine which cg's we need to send in this pulse from this zone */
7700 static void
7701 get_zone_pulse_cgs(gmx_domdec_t *dd,
7702 int zonei, int zone,
7703 int cg0, int cg1,
7704 const int *index_gl,
7705 const int *cgindex,
7706 int dim, int dim_ind,
7707 int dim0, int dim1, int dim2,
7708 real r_comm2, real r_bcomm2,
7709 matrix box,
7710 ivec tric_dist,
7711 rvec *normal,
7712 real skew_fac2_d, real skew_fac_01,
7713 rvec *v_d, rvec *v_0, rvec *v_1,
7714 const dd_corners_t *c,
7715 rvec sf2_round,
7716 gmx_bool bDistBonded,
7717 gmx_bool bBondComm,
7718 gmx_bool bDist2B,
7719 gmx_bool bDistMB,
7720 rvec *cg_cm,
7721 int *cginfo,
7722 gmx_domdec_ind_t *ind,
7723 int **ibuf, int *ibuf_nalloc,
7724 vec_rvec_t *vbuf,
7725 int *nsend_ptr,
7726 int *nat_ptr,
7727 int *nsend_z_ptr)
7729 gmx_domdec_comm_t *comm;
7730 gmx_bool bScrew;
7731 gmx_bool bDistMB_pulse;
7732 int cg, i;
7733 real r2, rb2, r, tric_sh;
7734 rvec rn, rb;
7735 int dimd;
7736 int nsend_z, nsend, nat;
7738 comm = dd->comm;
7740 bScrew = (dd->bScrewPBC && dim == XX);
7742 bDistMB_pulse = (bDistMB && bDistBonded);
7744 nsend_z = 0;
7745 nsend = *nsend_ptr;
7746 nat = *nat_ptr;
7748 for (cg = cg0; cg < cg1; cg++)
7750 r2 = 0;
7751 rb2 = 0;
7752 if (tric_dist[dim_ind] == 0)
7754 /* Rectangular direction, easy */
7755 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7756 if (r > 0)
7758 r2 += r*r;
7760 if (bDistMB_pulse)
7762 r = cg_cm[cg][dim] - c->bc[dim_ind];
7763 if (r > 0)
7765 rb2 += r*r;
7768 /* Rounding gives at most a 16% reduction
7769 * in communicated atoms
7771 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7773 r = cg_cm[cg][dim0] - c->cr0;
7774 /* This is the first dimension, so always r >= 0 */
7775 r2 += r*r;
7776 if (bDistMB_pulse)
7778 rb2 += r*r;
7781 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7783 r = cg_cm[cg][dim1] - c->cr1[zone];
7784 if (r > 0)
7786 r2 += r*r;
7788 if (bDistMB_pulse)
7790 r = cg_cm[cg][dim1] - c->bcr1;
7791 if (r > 0)
7793 rb2 += r*r;
7798 else
7800 /* Triclinic direction, more complicated */
7801 clear_rvec(rn);
7802 clear_rvec(rb);
7803 /* Rounding, conservative as the skew_fac multiplication
7804 * will slightly underestimate the distance.
7806 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7808 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7809 for (i = dim0+1; i < DIM; i++)
7811 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7813 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7814 if (bDistMB_pulse)
7816 rb[dim0] = rn[dim0];
7817 rb2 = r2;
7819 /* Take care that the cell planes along dim0 might not
7820 * be orthogonal to those along dim1 and dim2.
7822 for (i = 1; i <= dim_ind; i++)
7824 dimd = dd->dim[i];
7825 if (normal[dim0][dimd] > 0)
7827 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7828 if (bDistMB_pulse)
7830 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7835 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7837 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7838 tric_sh = 0;
7839 for (i = dim1+1; i < DIM; i++)
7841 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7843 rn[dim1] += tric_sh;
7844 if (rn[dim1] > 0)
7846 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7847 /* Take care of coupling of the distances
7848 * to the planes along dim0 and dim1 through dim2.
7850 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7851 /* Take care that the cell planes along dim1
7852 * might not be orthogonal to that along dim2.
7854 if (normal[dim1][dim2] > 0)
7856 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7859 if (bDistMB_pulse)
7861 rb[dim1] +=
7862 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7863 if (rb[dim1] > 0)
7865 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7866 /* Take care of coupling of the distances
7867 * to the planes along dim0 and dim1 through dim2.
7869 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7870 /* Take care that the cell planes along dim1
7871 * might not be orthogonal to that along dim2.
7873 if (normal[dim1][dim2] > 0)
7875 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7880 /* The distance along the communication direction */
7881 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7882 tric_sh = 0;
7883 for (i = dim+1; i < DIM; i++)
7885 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7887 rn[dim] += tric_sh;
7888 if (rn[dim] > 0)
7890 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7891 /* Take care of coupling of the distances
7892 * to the planes along dim0 and dim1 through dim2.
7894 if (dim_ind == 1 && zonei == 1)
7896 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7899 if (bDistMB_pulse)
7901 clear_rvec(rb);
7902 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7903 if (rb[dim] > 0)
7905 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7906 /* Take care of coupling of the distances
7907 * to the planes along dim0 and dim1 through dim2.
7909 if (dim_ind == 1 && zonei == 1)
7911 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7917 if (r2 < r_comm2 ||
7918 (bDistBonded &&
7919 ((bDistMB && rb2 < r_bcomm2) ||
7920 (bDist2B && r2 < r_bcomm2)) &&
7921 (!bBondComm ||
7922 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7923 missing_link(comm->cglink, index_gl[cg],
7924 comm->bLocalCG)))))
7926 /* Make an index to the local charge groups */
7927 if (nsend+1 > ind->nalloc)
7929 ind->nalloc = over_alloc_large(nsend+1);
7930 srenew(ind->index, ind->nalloc);
7932 if (nsend+1 > *ibuf_nalloc)
7934 *ibuf_nalloc = over_alloc_large(nsend+1);
7935 srenew(*ibuf, *ibuf_nalloc);
7937 ind->index[nsend] = cg;
7938 (*ibuf)[nsend] = index_gl[cg];
7939 nsend_z++;
7940 vec_rvec_check_alloc(vbuf, nsend+1);
7942 if (dd->ci[dim] == 0)
7944 /* Correct cg_cm for pbc */
7945 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7946 if (bScrew)
7948 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7949 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7952 else
7954 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7956 nsend++;
7957 nat += cgindex[cg+1] - cgindex[cg];
7961 *nsend_ptr = nsend;
7962 *nat_ptr = nat;
7963 *nsend_z_ptr = nsend_z;
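/*! \brief Sets up the halo communication: for each DD dimension and pulse selects the charge groups to send, exchanges counts, global indices and coordinates, and extends the zone charge-group ranges */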
7966 static void setup_dd_communication(gmx_domdec_t *dd,
7967 matrix box, gmx_ddbox_t *ddbox,
7968 t_forcerec *fr,
7969 t_state *state, PaddedRVecVector *f)
7971 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7972 int nzone, nzone_send, zone, zonei, cg0, cg1;
7973 int c, i, cg, cg_gl, nrcg;
7974 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7975 gmx_domdec_comm_t *comm;
7976 gmx_domdec_zones_t *zones;
7977 gmx_domdec_comm_dim_t *cd;
7978 gmx_domdec_ind_t *ind;
7979 cginfo_mb_t *cginfo_mb;
7980 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7981 real r_comm2, r_bcomm2;
7982 dd_corners_t corners;
7983 ivec tric_dist;
7984 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
7985 real skew_fac2_d, skew_fac_01;
7986 rvec sf2_round;
7987 int nsend, nat;
7988 int th;
7990 if (debug)
7992 fprintf(debug, "Setting up DD communication\n");
7995 comm = dd->comm;
7997 if (comm->nth == 0)
7999 /* Initialize the thread data.
8000 * This can not be done in init_domain_decomposition,
8001 * as the number of threads is determined later.
8003 comm->nth = gmx_omp_nthreads_get(emntDomdec);
8004 if (comm->nth > 1)
8006 snew(comm->dth, comm->nth);
8010 switch (fr->cutoff_scheme)
8012 case ecutsGROUP:
8013 cg_cm = fr->cg_cm;
8014 break;
8015 case ecutsVERLET:
8016 cg_cm = as_rvec_array(state->x.data());
8017 break;
8018 default:
8019 gmx_incons("unimplemented");
8020 cg_cm = nullptr;
8023 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8025 /* Check if we need to use triclinic distances */
8026 tric_dist[dim_ind] = 0;
8027 for (i = 0; i <= dim_ind; i++)
8029 if (ddbox->tric_dir[dd->dim[i]])
8031 tric_dist[dim_ind] = 1;
8036 bBondComm = comm->bBondComm;
8038 /* Do we need to determine extra distances for multi-body bondeds? */
8039 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8041 /* Do we need to determine extra distances for only two-body bondeds? */
8042 bDist2B = (bBondComm && !bDistMB);
8044 r_comm2 = gmx::square(comm->cutoff);
8045 r_bcomm2 = gmx::square(comm->cutoff_mbody);
8047 if (debug)
8049 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8052 zones = &comm->zones;
8054 dim0 = dd->dim[0];
8055 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8056 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8058 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8060 /* Triclinic stuff */
8061 normal = ddbox->normal;
8062 skew_fac_01 = 0;
8063 if (dd->ndim >= 2)
8065 v_0 = ddbox->v[dim0];
8066 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8068 /* Determine the coupling coefficient for the distances
8069 * to the cell planes along dim0 and dim1 through dim2.
8070 * This is required for correct rounding.
8072 skew_fac_01 =
8073 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8074 if (debug)
8076 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8080 if (dd->ndim >= 3)
8082 v_1 = ddbox->v[dim1];
8085 zone_cg_range = zones->cg_range;
8086 index_gl = dd->index_gl;
8087 cgindex = dd->cgindex;
8088 cginfo_mb = fr->cginfo_mb;
8090 zone_cg_range[0] = 0;
8091 zone_cg_range[1] = dd->ncg_home;
8092 comm->zone_ncg1[0] = dd->ncg_home;
8093 pos_cg = dd->ncg_home;
8095 nat_tot = dd->nat_home;
8096 nzone = 1;
8097 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8099 dim = dd->dim[dim_ind];
8100 cd = &comm->cd[dim_ind];
8102 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8104 /* No pbc in this dimension, the first node should not comm. */
8105 nzone_send = 0;
8107 else
8109 nzone_send = nzone;
8112 v_d = ddbox->v[dim];
8113 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8115 cd->bInPlace = TRUE;
8116 for (p = 0; p < cd->np; p++)
8118 /* Only atoms communicated in the first pulse are used
8119 * for multi-body bonded interactions or for bBondComm.
8121 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8123 ind = &cd->ind[p];
8124 nsend = 0;
8125 nat = 0;
8126 for (zone = 0; zone < nzone_send; zone++)
8128 if (tric_dist[dim_ind] && dim_ind > 0)
8130 /* Determine slightly more optimized skew_fac's
8131 * for rounding.
8132 * This reduces the number of communicated atoms
8133 * by about 10% for 3D DD of rhombic dodecahedra.
8135 for (dimd = 0; dimd < dim; dimd++)
8137 sf2_round[dimd] = 1;
8138 if (ddbox->tric_dir[dimd])
8140 for (i = dd->dim[dimd]+1; i < DIM; i++)
8142 /* If we are shifted in dimension i
8143 * and the cell plane is tilted forward
8144 * in dimension i, skip this coupling.
8146 if (!(zones->shift[nzone+zone][i] &&
8147 ddbox->v[dimd][i][dimd] >= 0))
8149 sf2_round[dimd] +=
8150 gmx::square(ddbox->v[dimd][i][dimd]);
8153 sf2_round[dimd] = 1/sf2_round[dimd];
8158 zonei = zone_perm[dim_ind][zone];
8159 if (p == 0)
8161 /* Here we permute the zones to obtain a convenient order
8162 * for neighbor searching
8164 cg0 = zone_cg_range[zonei];
8165 cg1 = zone_cg_range[zonei+1];
8167 else
8169 /* Look only at the cg's received in the previous grid pulse
8171 cg1 = zone_cg_range[nzone+zone+1];
8172 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8175 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8176 for (th = 0; th < comm->nth; th++)
8180 gmx_domdec_ind_t *ind_p;
8181 int **ibuf_p, *ibuf_nalloc_p;
8182 vec_rvec_t *vbuf_p;
8183 int *nsend_p, *nat_p;
8184 int *nsend_zone_p;
8185 int cg0_th, cg1_th;
8187 if (th == 0)
8189 /* Thread 0 writes in the comm buffers */
8190 ind_p = ind;
8191 ibuf_p = &comm->buf_int;
8192 ibuf_nalloc_p = &comm->nalloc_int;
8193 vbuf_p = &comm->vbuf;
8194 nsend_p = &nsend;
8195 nat_p = &nat;
8196 nsend_zone_p = &ind->nsend[zone];
8198 else
8200 /* Other threads write into temp buffers */
8201 ind_p = &comm->dth[th].ind;
8202 ibuf_p = &comm->dth[th].ibuf;
8203 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8204 vbuf_p = &comm->dth[th].vbuf;
8205 nsend_p = &comm->dth[th].nsend;
8206 nat_p = &comm->dth[th].nat;
8207 nsend_zone_p = &comm->dth[th].nsend_zone;
8209 comm->dth[th].nsend = 0;
8210 comm->dth[th].nat = 0;
8211 comm->dth[th].nsend_zone = 0;
8214 if (comm->nth == 1)
8216 cg0_th = cg0;
8217 cg1_th = cg1;
8219 else
8221 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8222 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8225 /* Get the cg's for this pulse in this zone */
8226 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8227 index_gl, cgindex,
8228 dim, dim_ind, dim0, dim1, dim2,
8229 r_comm2, r_bcomm2,
8230 box, tric_dist,
8231 normal, skew_fac2_d, skew_fac_01,
8232 v_d, v_0, v_1, &corners, sf2_round,
8233 bDistBonded, bBondComm,
8234 bDist2B, bDistMB,
8235 cg_cm, fr->cginfo,
8236 ind_p,
8237 ibuf_p, ibuf_nalloc_p,
8238 vbuf_p,
8239 nsend_p, nat_p,
8240 nsend_zone_p);
8242 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8243 } // END
8245 /* Append data of threads>=1 to the communication buffers */
8246 for (th = 1; th < comm->nth; th++)
8248 dd_comm_setup_work_t *dth;
8249 int i, ns1;
8251 dth = &comm->dth[th];
8253 ns1 = nsend + dth->nsend_zone;
8254 if (ns1 > ind->nalloc)
8256 ind->nalloc = over_alloc_dd(ns1);
8257 srenew(ind->index, ind->nalloc);
8259 if (ns1 > comm->nalloc_int)
8261 comm->nalloc_int = over_alloc_dd(ns1);
8262 srenew(comm->buf_int, comm->nalloc_int);
8264 if (ns1 > comm->vbuf.nalloc)
8266 comm->vbuf.nalloc = over_alloc_dd(ns1);
8267 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8270 for (i = 0; i < dth->nsend_zone; i++)
8272 ind->index[nsend] = dth->ind.index[i];
8273 comm->buf_int[nsend] = dth->ibuf[i];
8274 copy_rvec(dth->vbuf.v[i],
8275 comm->vbuf.v[nsend]);
8276 nsend++;
8278 nat += dth->nat;
8279 ind->nsend[zone] += dth->nsend_zone;
8282 /* Clear the counts in case we do not have pbc */
8283 for (zone = nzone_send; zone < nzone; zone++)
8285 ind->nsend[zone] = 0;
8287 ind->nsend[nzone] = nsend;
8288 ind->nsend[nzone+1] = nat;
8289 /* Communicate the number of cg's and atoms to receive */
8290 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8291 ind->nsend, nzone+2,
8292 ind->nrecv, nzone+2);
8294 /* The rvec buffer is also required for atom buffers of size nsend
8295 * in dd_move_x and dd_move_f.
8297 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8299 if (p > 0)
8301 /* We can receive in place if only the last zone is not empty */
8302 for (zone = 0; zone < nzone-1; zone++)
8304 if (ind->nrecv[zone] > 0)
8306 cd->bInPlace = FALSE;
8309 if (!cd->bInPlace)
8311 /* The int buffer is only required here for the cg indices */
8312 if (ind->nrecv[nzone] > comm->nalloc_int2)
8314 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8315 srenew(comm->buf_int2, comm->nalloc_int2);
8317 /* The rvec buffer is also required for atom buffers
8318 * of size nrecv in dd_move_x and dd_move_f.
8320 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8321 vec_rvec_check_alloc(&comm->vbuf2, i);
8325 /* Make space for the global cg indices */
8326 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8327 || dd->cg_nalloc == 0)
8329 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8330 srenew(index_gl, dd->cg_nalloc);
8331 srenew(cgindex, dd->cg_nalloc+1);
8333 /* Communicate the global cg indices */
8334 if (cd->bInPlace)
8336 recv_i = index_gl + pos_cg;
8338 else
8340 recv_i = comm->buf_int2;
8342 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8343 comm->buf_int, nsend,
8344 recv_i, ind->nrecv[nzone]);
8346 /* Make space for cg_cm */
8347 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8348 if (fr->cutoff_scheme == ecutsGROUP)
8350 cg_cm = fr->cg_cm;
8352 else
8354 cg_cm = as_rvec_array(state->x.data());
8356 /* Communicate cg_cm */
8357 if (cd->bInPlace)
8359 recv_vr = cg_cm + pos_cg;
8361 else
8363 recv_vr = comm->vbuf2.v;
8365 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8366 comm->vbuf.v, nsend,
8367 recv_vr, ind->nrecv[nzone]);
8369 /* Make the charge group index */
8370 if (cd->bInPlace)
8372 zone = (p == 0 ? 0 : nzone - 1);
8373 while (zone < nzone)
8375 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8377 cg_gl = index_gl[pos_cg];
8378 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8379 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8380 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8381 if (bBondComm)
8383 /* Update the charge group presence,
8384 * so we can use it in the next pass of the loop.
8386 comm->bLocalCG[cg_gl] = TRUE;
8388 pos_cg++;
8390 if (p == 0)
8392 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8394 zone++;
8395 zone_cg_range[nzone+zone] = pos_cg;
8398 else
8400 /* This part of the code is never executed with bBondComm. */
8401 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8402 index_gl, recv_i, cg_cm, recv_vr,
8403 cgindex, fr->cginfo_mb, fr->cginfo);
8404 pos_cg += ind->nrecv[nzone];
8406 nat_tot += ind->nrecv[nzone+1];
8408 if (!cd->bInPlace)
8410 /* Store the atom block for easy copying of communication buffers */
8411 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8413 nzone += nzone;
8415 dd->index_gl = index_gl;
8416 dd->cgindex = cgindex;
8418 dd->ncg_tot = zone_cg_range[zones->n];
8419 dd->nat_tot = nat_tot;
8420 comm->nat[ddnatHOME] = dd->nat_home;
8421 for (i = ddnatZONE; i < ddnatNR; i++)
8423 comm->nat[i] = dd->nat_tot;
8426 if (!bBondComm)
8428 /* We don't need to update cginfo, since that was already done above.
8429 * So we pass NULL for the forcerec.
8431 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8432 nullptr, comm->bLocalCG);
8435 if (debug)
8437 fprintf(debug, "Finished setting up DD communication, zones:");
8438 for (c = 0; c < zones->n; c++)
8440 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8442 fprintf(debug, "\n");
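/*! \brief Sets the i-zone charge-group boundaries from the zone charge-group ranges */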
8446 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8448 int c;
8450 for (c = 0; c < zones->nizone; c++)
8452 zones->izone[c].cg1 = zones->cg_range[c+1];
8453 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8454 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
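/*! \brief Sets the zone coordinate limits and bounding boxes for zones zone_start up to zone_end */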
8458 static void set_zones_size(gmx_domdec_t *dd,
8459 matrix box, const gmx_ddbox_t *ddbox,
8460 int zone_start, int zone_end)
8462 gmx_domdec_comm_t *comm;
8463 gmx_domdec_zones_t *zones;
8464 gmx_bool bDistMB;
8465 int z, zi, d, dim;
8466 real rcs, rcmbs;
8467 int i, j;
8468 real vol;
8470 comm = dd->comm;
8472 zones = &comm->zones;
8474 /* Do we need to determine extra distances for multi-body bondeds? */
8475 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8477 for (z = zone_start; z < zone_end; z++)
8479 /* Copy cell limits to zone limits.
8480 * Valid for non-DD dims and non-shifted dims.
8482 copy_rvec(comm->cell_x0, zones->size[z].x0);
8483 copy_rvec(comm->cell_x1, zones->size[z].x1);
8486 for (d = 0; d < dd->ndim; d++)
8488 dim = dd->dim[d];
8490 for (z = 0; z < zones->n; z++)
8492 /* With a staggered grid we have different sizes
8493 * for non-shifted dimensions.
8495 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
8497 if (d == 1)
8499 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8500 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8502 else if (d == 2)
8504 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8505 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8510 rcs = comm->cutoff;
8511 rcmbs = comm->cutoff_mbody;
8512 if (ddbox->tric_dir[dim])
8514 rcs /= ddbox->skew_fac[dim];
8515 rcmbs /= ddbox->skew_fac[dim];
8518 /* Set the lower limit for the shifted zone dimensions */
8519 for (z = zone_start; z < zone_end; z++)
8521 if (zones->shift[z][dim] > 0)
8523 dim = dd->dim[d];
8524 if (!isDlbOn(dd->comm) || d == 0)
8526 zones->size[z].x0[dim] = comm->cell_x1[dim];
8527 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8529 else
8531 /* Here we take the lower limit of the zone from
8532 * the lowest domain of the zone below.
8534 if (z < 4)
8536 zones->size[z].x0[dim] =
8537 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8539 else
8541 if (d == 1)
8543 zones->size[z].x0[dim] =
8544 zones->size[zone_perm[2][z-4]].x0[dim];
8546 else
8548 zones->size[z].x0[dim] =
8549 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8552 /* A temporary limit; it is updated below */
8553 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8555 if (bDistMB)
8557 for (zi = 0; zi < zones->nizone; zi++)
8559 if (zones->shift[zi][dim] == 0)
8561 /* This takes the whole zone into account.
8562 * With multiple pulses this will lead
8563 * to a larger zone than strictly necessary.
8565 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8566 zones->size[zi].x1[dim]+rcmbs);
8574 /* Loop over the i-zones to set the upper limit of each
8575 * j-zone they see.
8577 for (zi = 0; zi < zones->nizone; zi++)
8579 if (zones->shift[zi][dim] == 0)
8581 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8583 if (zones->shift[z][dim] > 0)
8585 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8586 zones->size[zi].x1[dim]+rcs);
8593 for (z = zone_start; z < zone_end; z++)
8595 /* Initialization only required to keep the compiler happy */
8596 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8597 int nc, c;
8599 /* To determine the bounding box for a zone we need to find
8600 * the extremes over 4, 2 or 1 corner points.
8602 nc = 1 << (ddbox->nboundeddim - 1);
8604 for (c = 0; c < nc; c++)
8606 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8607 corner[XX] = 0;
8608 if ((c & 1) == 0)
8610 corner[YY] = zones->size[z].x0[YY];
8612 else
8614 corner[YY] = zones->size[z].x1[YY];
8616 if ((c & 2) == 0)
8618 corner[ZZ] = zones->size[z].x0[ZZ];
8620 else
8622 corner[ZZ] = zones->size[z].x1[ZZ];
8624 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8625 box[ZZ][1 - dd->dim[0]] != 0)
8627 /* With 1D domain decomposition the cg's are not in
8628 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8629 * Shift the corner of the z-vector back along the box
8630 * vector of dimension d, so it will later end up at 0 along d.
8631 * This can affect the location of this corner along dd->dim[0]
8632 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8634 int d = 1 - dd->dim[0];
8636 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8638 /* Apply the triclinic couplings */
8639 assert(ddbox->npbcdim <= DIM);
8640 for (i = YY; i < ddbox->npbcdim; i++)
8642 for (j = XX; j < i; j++)
8644 corner[j] += corner[i]*box[i][j]/box[i][i];
8647 if (c == 0)
8649 copy_rvec(corner, corner_min);
8650 copy_rvec(corner, corner_max);
8652 else
8654 for (i = 0; i < DIM; i++)
8656 corner_min[i] = std::min(corner_min[i], corner[i]);
8657 corner_max[i] = std::max(corner_max[i], corner[i]);
8661 /* Copy the extreme corners without offset along x */
8662 for (i = 0; i < DIM; i++)
8664 zones->size[z].bb_x0[i] = corner_min[i];
8665 zones->size[z].bb_x1[i] = corner_max[i];
8667 /* Add the offset along x */
8668 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8669 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8672 if (zone_start == 0)
8674 vol = 1;
8675 for (dim = 0; dim < DIM; dim++)
8677 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8679 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8682 if (debug)
8684 for (z = zone_start; z < zone_end; z++)
8686 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8688 zones->size[z].x0[XX], zones->size[z].x1[XX],
8689 zones->size[z].x0[YY], zones->size[z].x1[YY],
8690 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8691 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8693 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8694 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8695 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
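/*! \brief qsort comparison function ordering charge groups by ns grid cell index, then by global topology index */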
8700 static int comp_cgsort(const void *a, const void *b)
8702 int comp;
8704 gmx_cgsort_t *cga, *cgb;
8705 cga = (gmx_cgsort_t *)a;
8706 cgb = (gmx_cgsort_t *)b;
8708 comp = cga->nsc - cgb->nsc;
8709 if (comp == 0)
8711 comp = cga->ind_gl - cgb->ind_gl;
8714 return comp;
8717 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8718 int *a, int *buf)
8720 int i;
8722 /* Order the data */
8723 for (i = 0; i < n; i++)
8725 buf[i] = a[sort[i].ind];
8728 /* Copy back to the original array */
8729 for (i = 0; i < n; i++)
8731 a[i] = buf[i];
8735 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8736 rvec *v, rvec *buf)
8738 int i;
8740 /* Order the data */
8741 for (i = 0; i < n; i++)
8743 copy_rvec(v[sort[i].ind], buf[i]);
8746 /* Copy back to the original array */
8747 for (i = 0; i < n; i++)
8749 copy_rvec(buf[i], v[i]);
8753 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8754 rvec *v, rvec *buf)
8756 int a, atot, cg, cg0, cg1, i;
8758 if (cgindex == nullptr)
8760 /* Avoid the useless loop over the atoms within a cg */
8761 order_vec_cg(ncg, sort, v, buf);
8763 return;
8766 /* Order the data */
8767 a = 0;
8768 for (cg = 0; cg < ncg; cg++)
8770 cg0 = cgindex[sort[cg].ind];
8771 cg1 = cgindex[sort[cg].ind+1];
8772 for (i = cg0; i < cg1; i++)
8774 copy_rvec(v[i], buf[a]);
8775 a++;
8778 atot = a;
8780 /* Copy back to the original array */
8781 for (a = 0; a < atot; a++)
8783 copy_rvec(buf[a], v[a]);
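/*! \brief qsorts the new entries and merges them with the already ordered sort2 list into sort1 */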
8787 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8788 int nsort_new, gmx_cgsort_t *sort_new,
8789 gmx_cgsort_t *sort1)
8791 int i1, i2, i_new;
8793 /* The new indices are not very ordered, so we qsort them */
8794 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8796 /* sort2 is already ordered, so now we can merge the two arrays */
8797 i1 = 0;
8798 i2 = 0;
8799 i_new = 0;
8800 while (i2 < nsort2 || i_new < nsort_new)
8802 if (i2 == nsort2)
8804 sort1[i1++] = sort_new[i_new++];
8806 else if (i_new == nsort_new)
8808 sort1[i1++] = sort2[i2++];
8810 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8811 (sort2[i2].nsc == sort_new[i_new].nsc &&
8812 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8814 sort1[i1++] = sort2[i2++];
8816 else
8818 sort1[i1++] = sort_new[i_new++];
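/*! \brief Determines the charge-group sort order for the group cut-off scheme and returns the new number of home charge groups */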
8823 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8825 gmx_domdec_sort_t *sort;
8826 gmx_cgsort_t *cgsort, *sort_i;
8827 int ncg_new, nsort2, nsort_new, i, *a, moved;
8829 sort = dd->comm->sort;
8831 a = fr->ns->grid->cell_index;
8833 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8835 if (ncg_home_old >= 0)
8837 /* The charge groups that remained in the same ns grid cell
8838 * are completely ordered. So we can sort efficiently by sorting
8839 * only the charge groups that moved and merging them into the stationary list.
8841 ncg_new = 0;
8842 nsort2 = 0;
8843 nsort_new = 0;
8844 for (i = 0; i < dd->ncg_home; i++)
8846 /* Check if this cg did not move to another node */
8847 if (a[i] < moved)
8849 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8851 /* This cg is new on this node or moved to another ns grid cell */
8852 if (nsort_new >= sort->sort_new_nalloc)
8854 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8855 srenew(sort->sort_new, sort->sort_new_nalloc);
8857 sort_i = &(sort->sort_new[nsort_new++]);
8859 else
8861 /* This cg did not move */
8862 sort_i = &(sort->sort2[nsort2++]);
8864 /* Sort on the ns grid cell indices
8865 * and the global topology index.
8866 * index_gl is irrelevant with cell ns,
8867 * but we set it here anyhow to avoid a conditional.
8869 sort_i->nsc = a[i];
8870 sort_i->ind_gl = dd->index_gl[i];
8871 sort_i->ind = i;
8872 ncg_new++;
8875 if (debug)
8877 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8878 nsort2, nsort_new);
8880 /* Sort efficiently */
8881 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8882 sort->sort);
8884 else
8886 cgsort = sort->sort;
8887 ncg_new = 0;
8888 for (i = 0; i < dd->ncg_home; i++)
8890 /* Sort on the ns grid cell indices
8891 * and the global topology index
8893 cgsort[i].nsc = a[i];
8894 cgsort[i].ind_gl = dd->index_gl[i];
8895 cgsort[i].ind = i;
8896 if (cgsort[i].nsc < moved)
8898 ncg_new++;
8901 if (debug)
8903 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8905 /* Determine the order of the charge groups using qsort */
8906 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8909 return ncg_new;
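/*! \brief Determines the atom sort order from the nbnxn grid (Verlet scheme) and returns the new number of home entries */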
8912 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8914 gmx_cgsort_t *sort;
8915 int ncg_new, i, na;
8916 const int *a;
8918 sort = dd->comm->sort->sort;
8920 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8922 ncg_new = 0;
8923 for (i = 0; i < na; i++)
8925 if (a[i] >= 0)
8927 sort[ncg_new].ind = a[i];
8928 ncg_new++;
8932 return ncg_new;
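/*! \brief Sorts the local state (coordinates, velocities, cg centers and indices) into ns/nbnxn grid order */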
8935 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8936 int ncg_home_old)
8938 gmx_domdec_sort_t *sort;
8939 gmx_cgsort_t *cgsort;
8940 int *cgindex;
8941 int ncg_new, i, *ibuf, cgsize;
8942 rvec *vbuf;
8944 sort = dd->comm->sort;
8946 if (dd->ncg_home > sort->sort_nalloc)
8948 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8949 srenew(sort->sort, sort->sort_nalloc);
8950 srenew(sort->sort2, sort->sort_nalloc);
8952 cgsort = sort->sort;
8954 switch (fr->cutoff_scheme)
8956 case ecutsGROUP:
8957 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8958 break;
8959 case ecutsVERLET:
8960 ncg_new = dd_sort_order_nbnxn(dd, fr);
8961 break;
8962 default:
8963 gmx_incons("unimplemented");
8964 ncg_new = 0;
8967 /* We alloc with the old size, since cgindex is still old */
8968 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8969 vbuf = dd->comm->vbuf.v;
8971 if (dd->comm->bCGs)
8973 cgindex = dd->cgindex;
8975 else
8977 cgindex = nullptr;
8980 /* Remove the charge groups which are no longer at home here */
8981 dd->ncg_home = ncg_new;
8982 if (debug)
8984 fprintf(debug, "Set the new home charge group count to %d\n",
8985 dd->ncg_home);
8988 /* Reorder the state */
8989 if (state->flags & (1 << estX))
8991 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
8993 if (state->flags & (1 << estV))
8995 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
8997 if (state->flags & (1 << estCGP))
8999 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9002 if (fr->cutoff_scheme == ecutsGROUP)
9004 /* Reorder cgcm */
9005 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9008 if (dd->ncg_home+1 > sort->ibuf_nalloc)
9010 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9011 srenew(sort->ibuf, sort->ibuf_nalloc);
9013 ibuf = sort->ibuf;
9014 /* Reorder the global cg index */
9015 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9016 /* Reorder the cginfo */
9017 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9018 /* Rebuild the local cg index */
9019 if (dd->comm->bCGs)
9021 ibuf[0] = 0;
9022 for (i = 0; i < dd->ncg_home; i++)
9024 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9025 ibuf[i+1] = ibuf[i] + cgsize;
9027 for (i = 0; i < dd->ncg_home+1; i++)
9029 dd->cgindex[i] = ibuf[i];
9032 else
9034 for (i = 0; i < dd->ncg_home+1; i++)
9036 dd->cgindex[i] = i;
9039 /* Set the home atom number */
9040 dd->nat_home = dd->cgindex[dd->ncg_home];
9042 if (fr->cutoff_scheme == ecutsVERLET)
9044 /* The atoms are now exactly in grid order, update the grid order */
9045 nbnxn_set_atomorder(fr->nbv->nbs);
9047 else
9049 /* Copy the sorted ns cell indices back to the ns grid struct */
9050 for (i = 0; i < dd->ncg_home; i++)
9052 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9054 fr->ns->grid->nr = dd->ncg_home;
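/*! \brief Accumulates the per-zone communicated atom counts into the DD statistics */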
9058 static void add_dd_statistics(gmx_domdec_t *dd)
9060 gmx_domdec_comm_t *comm;
9061 int ddnat;
9063 comm = dd->comm;
9065 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9067 comm->sum_nat[ddnat-ddnatZONE] +=
9068 comm->nat[ddnat] - comm->nat[ddnat-1];
9070 comm->ndecomp++;
9073 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9075 gmx_domdec_comm_t *comm;
9076 int ddnat;
9078 comm = dd->comm;
9080 /* Reset all the statistics and counters for total run counting */
9081 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9083 comm->sum_nat[ddnat-ddnatZONE] = 0;
9085 comm->ndecomp = 0;
9086 comm->nload = 0;
9087 comm->load_step = 0;
9088 comm->load_sum = 0;
9089 comm->load_max = 0;
9090 clear_ivec(comm->load_lim);
9091 comm->load_mdf = 0;
9092 comm->load_pme = 0;
9095 void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
9097 gmx_domdec_comm_t *comm;
9098 int ddnat;
9099 double av;
9101 comm = cr->dd->comm;
9103 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9105 if (fplog == nullptr)
9107 return;
9110 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9112 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9114 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9115 switch (ddnat)
9117 case ddnatZONE:
9118 fprintf(fplog,
9119 " av. #atoms communicated per step for force: %d x %.1f\n",
9120 2, av);
9121 break;
9122 case ddnatVSITE:
9123 if (cr->dd->vsite_comm)
9125 fprintf(fplog,
9126 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9127 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9128 av);
9130 break;
9131 case ddnatCON:
9132 if (cr->dd->constraint_comm)
9134 fprintf(fplog,
9135 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9136 1 + ir->nLincsIter, av);
9138 break;
9139 default:
9140 gmx_incons(" Unknown type for DD statistics");
9143 fprintf(fplog, "\n");
9145 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9147 print_dd_load_av(fplog, cr->dd);
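/* Repartitions the system over the DD ranks: redistributes the charge groups,
 * sets up the zones and halo communication, and rebuilds the local data for
 * the new decomposition. */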
9151 void dd_partition_system(FILE *fplog,
9152 gmx_int64_t step,
9153 t_commrec *cr,
9154 gmx_bool bMasterState,
9155 int nstglobalcomm,
9156 t_state *state_global,
9157 const gmx_mtop_t *top_global,
9158 const t_inputrec *ir,
9159 t_state *state_local,
9160 PaddedRVecVector *f,
9161 t_mdatoms *mdatoms,
9162 gmx_localtop_t *top_local,
9163 t_forcerec *fr,
9164 gmx_vsite_t *vsite,
9165 gmx_constr_t constr,
9166 t_nrnb *nrnb,
9167 gmx_wallcycle_t wcycle,
9168 gmx_bool bVerbose)
9170 gmx_domdec_t *dd;
9171 gmx_domdec_comm_t *comm;
9172 gmx_ddbox_t ddbox = {0};
9173 t_block *cgs_gl;
9174 gmx_int64_t step_pcoupl;
9175 rvec cell_ns_x0, cell_ns_x1;
9176 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9177 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9178 gmx_bool bRedist, bSortCG, bResortAll;
9179 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9180 real grid_density;
9181 char sbuf[22];
9183 wallcycle_start(wcycle, ewcDOMDEC);
9185 dd = cr->dd;
9186 comm = dd->comm;
9188 bBoxChanged = (bMasterState || inputrecDeform(ir));
9189 if (ir->epc != epcNO)
9191 /* With nstpcouple > 1 pressure coupling happens
9192 * one step after calculating the pressure.
9193 * Box scaling happens at the end of the MD step,
9194 * after the DD partitioning.
9195 * We therefore have to do DLB in the first partitioning
9196 * after an MD step where P-coupling occurred.
9197 * We need to determine the last step in which p-coupling occurred.
9198 * MRS -- need to validate this for vv?
9200 n = ir->nstpcouple;
9201 if (n == 1)
9203 step_pcoupl = step - 1;
9205 else
9207 step_pcoupl = ((step - 1)/n)*n + 1;
9209 if (step_pcoupl >= comm->partition_step)
9211 bBoxChanged = TRUE;
9215 bNStGlobalComm = (step % nstglobalcomm == 0);
9217 if (!isDlbOn(comm))
9219 bDoDLB = FALSE;
9221 else
9223 /* Should we do dynamic load balancing this step?
9224 * Since it requires (possibly expensive) global communication,
9225 * we might want to do DLB less frequently.
9227 if (bBoxChanged || ir->epc != epcNO)
9229 bDoDLB = bBoxChanged;
9231 else
9233 bDoDLB = bNStGlobalComm;
9237 /* Check if we have recorded loads on the nodes */
9238 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9240 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9242 /* Print load every nstlog, first and last step to the log file */
9243 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9244 comm->n_load_collect == 0 ||
9245 (ir->nsteps >= 0 &&
9246 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9248 /* Avoid extra communication due to verbose screen output
9249 * when nstglobalcomm is set.
9251 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9252 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9254 get_load_distribution(dd, wcycle);
9255 if (DDMASTER(dd))
9257 if (bLogLoad)
9259 dd_print_load(fplog, dd, step-1);
9261 if (bVerbose)
9263 dd_print_load_verbose(dd);
9266 comm->n_load_collect++;
9268 if (isDlbOn(comm))
9270 if (DDMASTER(dd))
9272 /* Add the measured cycles to the running average */
9273 const float averageFactor = 0.1f;
9274 comm->cyclesPerStepDlbExpAverage =
9275 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9276 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
9278 if (comm->dlbState == edlbsOnCanTurnOff &&
9279 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9281 gmx_bool turnOffDlb;
9282 if (DDMASTER(dd))
9284 /* If the running averaged cycles with DLB are more
9285 * than before we turned on DLB, turn off DLB.
9286 * We will again run and check the cycles without DLB
9287 * and we can then decide whether to turn off DLB forever.
9289 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9290 comm->cyclesPerStepBeforeDLB);
9292 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9293 if (turnOffDlb)
9295 /* To turn off DLB, we need to redistribute the atoms */
9296 dd_collect_state(dd, state_local, state_global);
9297 bMasterState = TRUE;
9298 turn_off_dlb(fplog, cr, step);
9302 else if (bCheckWhetherToTurnDlbOn)
9304 gmx_bool turnOffDlbForever = FALSE;
9305 gmx_bool turnOnDlb = FALSE;
9307 /* Since the timings are node dependent, the master decides */
9308 if (DDMASTER(dd))
9310 /* If we recently turned off DLB, we want to check if
9311 * performance is better without DLB. We want to do this
9312 * ASAP to minimize the chance that external factors which
9313 * slowed down the DLB steps have already disappeared and we
9314 * incorrectly conclude that DLB was causing the slowdown.
9315 * So we measure one nstlist block, no running average.
9317 if (comm->haveTurnedOffDlb &&
9318 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9319 comm->cyclesPerStepDlbExpAverage)
9321 /* After turning off DLB we ran nstlist steps in fewer
9322 * cycles than with DLB. This likely means that DLB
9323 * is not beneficial, but this could be due to a one-time
9324 * unlucky fluctuation, so we require two such
9325 * observations in close succession to turn off DLB
9326 * forever.
9328 if (comm->dlbSlowerPartitioningCount > 0 &&
9329 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9331 turnOffDlbForever = TRUE;
9333 comm->haveTurnedOffDlb = false;
9334 /* Register when we last measured DLB slowdown */
9335 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9337 else
9339 /* Here we check if the max PME rank load is more than 0.98 times
9340 * the max PP force load. If so, PP DLB will not help,
9341 * since we are (almost) limited by PME. Furthermore,
9342 * DLB will cause a significant extra x/f redistribution
9343 * cost on the PME ranks, which will then surely result
9344 * in lower total performance.
9346 if (cr->npmenodes > 0 &&
9347 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9349 turnOnDlb = FALSE;
9351 else
9353 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9357 struct
9358 {
9359 gmx_bool turnOffDlbForever;
9360 gmx_bool turnOnDlb;
9361 }
9362 bools {
9363 turnOffDlbForever, turnOnDlb
9364 };
9365 dd_bcast(dd, sizeof(bools), &bools);
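/* Packing both flags into one struct keeps this to a single broadcast
 * from the master instead of two separate collective calls.
 */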
9366 if (bools.turnOffDlbForever)
9368 turn_off_dlb_forever(fplog, cr, step);
9370 else if (bools.turnOnDlb)
9372 turn_on_dlb(fplog, cr, step);
9373 bDoDLB = TRUE;
9377 comm->n_load_have++;
9380 cgs_gl = &comm->cgs_gl;
9382 bRedist = FALSE;
9383 if (bMasterState)
9385 /* Clear the old state */
9386 clear_dd_indices(dd, 0, 0);
9387 ncgindex_set = 0;
9389 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9390 TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
9392 get_cg_distribution(fplog, dd, cgs_gl,
9393 state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
9395 dd_distribute_state(dd, cgs_gl,
9396 state_global, state_local, f);
9398 dd_make_local_cgs(dd, &top_local->cgs);
9400 /* Ensure that we have space for the new distribution */
9401 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9403 if (fr->cutoff_scheme == ecutsGROUP)
9405 calc_cgcm(fplog, 0, dd->ncg_home,
9406 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9409 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9411 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9413 else if (state_local->ddp_count != dd->ddp_count)
9415 if (state_local->ddp_count > dd->ddp_count)
9417 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9420 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9422 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9425 /* Clear the old state */
9426 clear_dd_indices(dd, 0, 0);
9428 /* Build the new indices */
9429 rebuild_cgindex(dd, cgs_gl->index, state_local);
9430 make_dd_indices(dd, cgs_gl->index, 0);
9431 ncgindex_set = dd->ncg_home;
9433 if (fr->cutoff_scheme == ecutsGROUP)
9435 /* Redetermine the cg COMs */
9436 calc_cgcm(fplog, 0, dd->ncg_home,
9437 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9440 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9442 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9444 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9445 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9447 bRedist = isDlbOn(comm);
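/* With DLB the cell boundaries are dynamic, so the restored charge-group
 * distribution may no longer match the current cells; in that case a
 * redistribution pass is needed even though the indices could be rebuilt.
 */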
9449 else
9451 /* We have the full state, only redistribute the cgs */
9453 /* Clear the non-home indices */
9454 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9455 ncgindex_set = 0;
9457 /* Avoid global communication for dims without pbc when -gcom limits communication */
9458 if (!bNStGlobalComm)
9460 copy_rvec(comm->box0, ddbox.box0 );
9461 copy_rvec(comm->box_size, ddbox.box_size);
9463 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9464 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9466 bBoxChanged = TRUE;
9467 bRedist = TRUE;
9469 /* Store the box info for dims without pbc, for reuse on steps without global communication (-gcom) */
9470 copy_rvec(ddbox.box0, comm->box0 );
9471 copy_rvec(ddbox.box_size, comm->box_size);
9473 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9474 step, wcycle);
9476 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9478 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9481 /* Check if we should sort the charge groups */
9482 bSortCG = (bMasterState || bRedist);
9484 ncg_home_old = dd->ncg_home;
9486 ncg_moved = 0;
9487 if (bRedist)
9489 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9491 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9492 state_local, f, fr,
9493 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9495 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9498 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9499 dd, &ddbox,
9500 &comm->cell_x0, &comm->cell_x1,
9501 dd->ncg_home, fr->cg_cm,
9502 cell_ns_x0, cell_ns_x1, &grid_density);
9504 if (bBoxChanged)
9506 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9509 switch (fr->cutoff_scheme)
9511 case ecutsGROUP:
9512 copy_ivec(fr->ns->grid->n, ncells_old);
9513 grid_first(fplog, fr->ns->grid, dd, &ddbox,
9514 state_local->box, cell_ns_x0, cell_ns_x1,
9515 fr->rlist, grid_density);
9516 break;
9517 case ecutsVERLET:
9518 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9519 break;
9520 default:
9521 gmx_incons("unimplemented");
9523 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9524 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9526 if (bSortCG)
9528 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9530 /* Sort the state on charge group position.
9531 * This enables exact restarts from this step.
9532 * It also improves performance by about 15% with larger numbers
9533 * of atoms per node.
9536 /* Fill the ns grid with the home cell,
9537 * so we can sort with the indices.
9539 set_zones_ncg_home(dd);
9541 switch (fr->cutoff_scheme)
9543 case ecutsVERLET:
9544 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9546 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9548 comm->zones.size[0].bb_x0,
9549 comm->zones.size[0].bb_x1,
9550 0, dd->ncg_home,
9551 comm->zones.dens_zone0,
9552 fr->cginfo,
9553 as_rvec_array(state_local->x.data()),
9554 ncg_moved, bRedist ? comm->moved : nullptr,
9555 fr->nbv->grp[eintLocal].kernel_type,
9556 fr->nbv->grp[eintLocal].nbat);
9558 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9559 break;
9560 case ecutsGROUP:
9561 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9562 0, dd->ncg_home, fr->cg_cm);
9564 copy_ivec(fr->ns->grid->n, ncells_new);
9565 break;
9566 default:
9567 gmx_incons("unimplemented");
9570 bResortAll = bMasterState;
9572 /* Check if we can use the old order and ns grid cell indices
9573 * of the charge groups to sort the charge groups efficiently.
9575 if (ncells_new[XX] != ncells_old[XX] ||
9576 ncells_new[YY] != ncells_old[YY] ||
9577 ncells_new[ZZ] != ncells_old[ZZ])
9579 bResortAll = TRUE;
9582 if (debug)
9584 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9585 gmx_step_str(step, sbuf), dd->ncg_home);
9587 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9588 bResortAll ? -1 : ncg_home_old);
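/* Passing -1 forces a full resort; a non-negative count allows the sort
 * to reuse the existing order of the charge groups that were already home
 * and only merge in the ones that moved here during redistribution.
 */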
9590 /* After sorting and compacting we set the correct size */
9591 dd_resize_state(state_local, f, dd->nat_home);
9593 /* Rebuild all the indices */
9594 ga2la_clear(dd->ga2la);
9595 ncgindex_set = 0;
9597 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9600 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9602 /* Set up the communication and communicate the coordinates */
9603 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9605 /* Set the indices */
9606 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9608 /* Set the charge group boundaries for neighbor searching */
9609 set_cg_boundaries(&comm->zones);
9611 if (fr->cutoff_scheme == ecutsVERLET)
9613 set_zones_size(dd, state_local->box, &ddbox,
9614 bSortCG ? 1 : 0, comm->zones.n);
9617 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9620 /* write_dd_pdb("dd_home", step, "dump", top_global, cr,
9621    -1, as_rvec_array(state_local->x.data()), state_local->box); */
9624 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9626 /* Extract a local topology from the global topology */
9627 for (i = 0; i < dd->ndim; i++)
9629 np[dd->dim[i]] = comm->cd[i].np;
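/* np[] now holds the number of communication pulses per decomposition
 * dimension, which dd_make_local_top uses to determine how far bonded
 * interactions can reach into the communicated zones.
 */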
9631 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9632 comm->cellsize_min, np,
9634 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9635 vsite, top_global, top_local);
9637 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9639 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9641 /* Set up the special atom communication */
9642 n = comm->nat[ddnatZONE];
9643 for (i = ddnatZONE+1; i < ddnatNR; i++)
9645 switch (i)
9647 case ddnatVSITE:
9648 if (vsite && vsite->n_intercg_vsite)
9650 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9652 break;
9653 case ddnatCON:
9654 if (dd->bInterCGcons || dd->bInterCGsettles)
9656 /* Special code is only needed for inter-cg constraints */
9657 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9658 constr, ir->nProjOrder,
9659 top_local->idef.il);
9661 break;
9662 default:
9663 gmx_incons("Unknown special atom type setup");
9665 comm->nat[i] = n;
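/* comm->nat[] now contains cumulative atom counts: the zone atoms, then
 * additionally the vsite constructing atoms, then the constraint-coupled
 * atoms; comm->nat[ddnatNR-1] is the total number of atoms needed locally.
 */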
9668 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9670 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9672 /* Make space for the extra coordinates for virtual site
9673 * or constraint communication.
9675 state_local->natoms = comm->nat[ddnatNR-1];
9677 dd_resize_state(state_local, f, state_local->natoms);
9679 if (fr->bF_NoVirSum)
9681 if (vsite && vsite->n_intercg_vsite)
9683 nat_f_novirsum = comm->nat[ddnatVSITE];
9685 else
9687 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9689 nat_f_novirsum = dd->nat_tot;
9691 else
9693 nat_f_novirsum = dd->nat_home;
9697 else
9699 nat_f_novirsum = 0;
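/* nat_f_novirsum is the number of atoms for which a separate force buffer
 * is needed for contributions that should not enter the virial summation;
 * zero means no such buffer is required.
 */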
9702 /* Set the number of atoms required for the force calculation.
9703 * Forces need to be constrained when doing energy
9704 * minimization. For simple simulations we could avoid some
9705 * allocation, zeroing and copying, but this is probably not worth
9706 * the complications and checking.
9708 forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9709 dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9711 /* Update atom data for mdatoms and several algorithms */
9712 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9713 nullptr, mdatoms, vsite, nullptr);
9715 if (ir->implicit_solvent)
9717 make_local_gb(cr, fr->born, ir->gb_algorithm);
9720 if (!(cr->duty & DUTY_PME))
9722 /* Send the charges and/or c6/sigmas to our PME only node */
9723 gmx_pme_send_parameters(cr,
9724 fr->ic,
9725 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9726 mdatoms->chargeA, mdatoms->chargeB,
9727 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9728 mdatoms->sigmaA, mdatoms->sigmaB,
9729 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
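/* The PME-only ranks keep their own copies of the per-atom parameters,
 * so these have to be resent after each repartitioning, along with the
 * new dd_pme_maxshift_x/y values.
 */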
9732 if (constr)
9734 set_constraints(constr, top_local, ir, mdatoms, cr);
9737 if (ir->bPull)
9739 /* Update the local pull groups */
9740 dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9743 if (ir->bRot)
9745 /* Update the local rotation groups */
9746 dd_make_local_rotation_groups(dd, ir->rot);
9749 if (ir->eSwapCoords != eswapNO)
9751 /* Update the local groups needed for ion swapping */
9752 dd_make_local_swap_groups(dd, ir->swap);
9755 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9756 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9758 add_dd_statistics(dd);
9760 /* Make sure we only count the cycles for this DD partitioning */
9761 clear_dd_cycle_counts(dd);
9763 /* Because the order of the atoms might have changed since
9764 * the last vsite construction, we need to communicate the constructing
9765 * atom coordinates again (for spreading the forces this MD step).
9767 dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9769 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9771 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9773 dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
9774 write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9775 -1, as_rvec_array(state_local->x.data()), state_local->box);
9778 /* Store the partitioning step */
9779 comm->partition_step = step;
9781 /* Increase the DD partitioning counter */
9782 dd->ddp_count++;
9783 /* The state currently matches this DD partitioning count, store it */
9784 state_local->ddp_count = dd->ddp_count;
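/* On the next partitioning call, comparing state_local->ddp_count with
 * dd->ddp_count (see the branch above that rebuilds the indices) shows
 * whether the state still matches the current decomposition.
 */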
9785 if (bMasterState)
9787 /* The DD master node knows the complete cg distribution,
9788 * so store the count to possibly skip the cg info communication.
9790 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9793 if (comm->DD_debug > 0)
9795 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9796 check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9797 "after partitioning");
9800 wallcycle_stop(wcycle, ewcDOMDEC);