src/gromacs/domdec/domdec.cpp (gromacs.git blob e1862068, commit "Add conserved quantity for Berendsen P-couple")
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
49 #include <algorithm>
51 #include "gromacs/domdec/domdec_network.h"
52 #include "gromacs/domdec/ga2la.h"
53 #include "gromacs/ewald/pme.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gpu_utils/gpu_utils.h"
60 #include "gromacs/hardware/hw_info.h"
61 #include "gromacs/imd/imd.h"
62 #include "gromacs/listed-forces/manage-threading.h"
63 #include "gromacs/math/functions.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/math/vectypes.h"
66 #include "gromacs/mdlib/constr.h"
67 #include "gromacs/mdlib/force.h"
68 #include "gromacs/mdlib/forcerec.h"
69 #include "gromacs/mdlib/genborn.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/mdrun.h"
73 #include "gromacs/mdlib/mdsetup.h"
74 #include "gromacs/mdlib/nb_verlet.h"
75 #include "gromacs/mdlib/nbnxn_grid.h"
76 #include "gromacs/mdlib/nsgrid.h"
77 #include "gromacs/mdlib/vsite.h"
78 #include "gromacs/mdtypes/commrec.h"
79 #include "gromacs/mdtypes/df_history.h"
80 #include "gromacs/mdtypes/forcerec.h"
81 #include "gromacs/mdtypes/inputrec.h"
82 #include "gromacs/mdtypes/md_enums.h"
83 #include "gromacs/mdtypes/mdatom.h"
84 #include "gromacs/mdtypes/nblist.h"
85 #include "gromacs/mdtypes/state.h"
86 #include "gromacs/pbcutil/ishift.h"
87 #include "gromacs/pbcutil/pbc.h"
88 #include "gromacs/pulling/pull.h"
89 #include "gromacs/pulling/pull_rotation.h"
90 #include "gromacs/swap/swapcoords.h"
91 #include "gromacs/timing/wallcycle.h"
92 #include "gromacs/topology/block.h"
93 #include "gromacs/topology/idef.h"
94 #include "gromacs/topology/ifunc.h"
95 #include "gromacs/topology/mtop_lookup.h"
96 #include "gromacs/topology/mtop_util.h"
97 #include "gromacs/topology/topology.h"
98 #include "gromacs/utility/basedefinitions.h"
99 #include "gromacs/utility/basenetwork.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/exceptions.h"
102 #include "gromacs/utility/fatalerror.h"
103 #include "gromacs/utility/gmxmpi.h"
104 #include "gromacs/utility/qsort_threadsafe.h"
105 #include "gromacs/utility/real.h"
106 #include "gromacs/utility/smalloc.h"
108 #include "domdec_constraints.h"
109 #include "domdec_internal.h"
110 #include "domdec_vsite.h"
112 #define DDRANK(dd, rank) (rank)
113 #define DDMASTERRANK(dd) (dd->masterrank)
115 struct gmx_domdec_master_t
117 /* The cell boundaries */
118 real **cell_x;
119 /* The global charge group division */
120 int *ncg; /* Number of home charge groups for each node */
121 int *index; /* Index of nnodes+1 into cg */
122 int *cg; /* Global charge group index */
123 int *nat; /* Number of home atoms for each node. */
124 int *ibuf; /* Buffer for communication */
125 rvec *vbuf; /* Buffer for state scattering and gathering */
128 #define DD_NLOAD_MAX 9
130 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
132 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
133 #define DD_CGIBS 2
135 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
136 #define DD_FLAG_NRCG 65535
137 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
138 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
140 /* The DD zone order */
141 static const ivec dd_zo[DD_MAXZONE] =
142 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
144 /* The non-bonded zone-pair setup for domain decomposition
145 * The first number is the i-zone, the second number the first j-zone seen by
146 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
147 * As is, this is for 3D decomposition, where there are 4 i-zones.
148 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
149 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
151 static const int
152 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
153 {1, 3, 6},
154 {2, 5, 6},
155 {3, 5, 7}};
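/* Read as: {i-zone, first j-zone, last+1 j-zone}. For example, the entry
 * {1, 3, 6} means i-zone 1 interacts with j-zones 3, 4 and 5, while
 * {0, 0, 8} means the home i-zone 0 sees all 8 zones.
 */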
157 /* Factors used to avoid problems due to rounding issues */
158 #define DD_CELL_MARGIN 1.0001
159 #define DD_CELL_MARGIN2 1.00005
160 /* Factor to account for pressure scaling during nstlist steps */
161 #define DD_PRES_SCALE_MARGIN 1.02
163 /* Turn on DLB when the load imbalance causes this amount of total loss.
164 * There is a bit of overhead with DLB and it's difficult to achieve
165 * a load imbalance of less than 2% with DLB.
167 #define DD_PERF_LOSS_DLB_ON 0.02
169 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
170 #define DD_PERF_LOSS_WARN 0.05
172 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
174 /* Use separate MPI send and receive commands
175 * when nnodes <= GMX_DD_NNODES_SENDRECV.
176 * This saves memory (and some copying for small nnodes).
177 * For high parallelization scatter and gather calls are used.
179 #define GMX_DD_NNODES_SENDRECV 4
 182 /* We check whether to turn on DLB at the first and then at every 100th DD partitioning.
183 * With large imbalance DLB will turn on at the first step, so we can
184 * make the interval so large that the MPI overhead of the check is negligible.
186 static const int c_checkTurnDlbOnInterval = 100;
 187 /* We need to check whether DLB results in worse performance and, if so, turn it off.
 188  * We check this more often than for turning DLB on, because DLB can scale
189 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
190 * and furthermore, we are already synchronizing often with DLB, so
191 * the overhead of the MPI Bcast is not that high.
193 static const int c_checkTurnDlbOffInterval = 20;
195 /* Forward declaration */
196 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
/* Disabled legacy z-major ordering, kept here only for reference:
   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])

   static void index2xyz(ivec nc, int ind, ivec xyz)
   {
       xyz[XX] = ind % nc[XX];
       xyz[YY] = (ind / nc[XX]) % nc[YY];
       xyz[ZZ] = ind / (nc[YY]*nc[XX]);
   }
 */
210 /* This order is required to minimize the coordinate communication in PME
211 * which uses decomposition in the x direction.
213 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
215 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
217 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
218 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
219 xyz[ZZ] = ind % nc[ZZ];
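/* Example (illustrative values): with nc = {4, 3, 2}, the cell with
 * coordinates i = {1, 2, 1} gets dd_index = ((1*3 + 2)*2) + 1 = 11,
 * and ddindex2xyz(nc, 11, xyz) recovers xyz = {1, 2, 1}.
 */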
222 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
224 int ddindex;
225 int ddnodeid = -1;
227 ddindex = dd_index(dd->nc, c);
228 if (dd->comm->bCartesianPP_PME)
230 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
232 else if (dd->comm->bCartesianPP)
234 #if GMX_MPI
235 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
236 #endif
238 else
240 ddnodeid = ddindex;
243 return ddnodeid;
246 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
248 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
251 int ddglatnr(const gmx_domdec_t *dd, int i)
253 int atnr;
255 if (dd == nullptr)
257 atnr = i + 1;
259 else
261 if (i >= dd->comm->nat[ddnatNR-1])
263 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
265 atnr = dd->gatindex[i] + 1;
268 return atnr;
271 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
273 return &dd->comm->cgs_gl;
276 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
278 return (comm->dlbState == edlbsOnCanTurnOff ||
279 comm->dlbState == edlbsOnForever);
282 static void vec_rvec_init(vec_rvec_t *v)
284 v->nalloc = 0;
285 v->v = nullptr;
288 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
290 if (n > v->nalloc)
292 v->nalloc = over_alloc_dd(n);
293 srenew(v->v, v->nalloc);
297 void dd_store_state(gmx_domdec_t *dd, t_state *state)
299 int i;
301 if (state->ddp_count != dd->ddp_count)
 303         gmx_incons("The state does not match the domain decomposition state");
306 state->cg_gl.resize(dd->ncg_home);
307 for (i = 0; i < dd->ncg_home; i++)
309 state->cg_gl[i] = dd->index_gl[i];
312 state->ddp_count_cg_gl = dd->ddp_count;
315 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
317 return &dd->comm->zones;
320 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
321 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
323 gmx_domdec_zones_t *zones;
324 int izone, d, dim;
326 zones = &dd->comm->zones;
328 izone = 0;
329 while (icg >= zones->izone[izone].cg1)
331 izone++;
334 if (izone == 0)
336 *jcg0 = icg;
338 else if (izone < zones->nizone)
340 *jcg0 = zones->izone[izone].jcg0;
342 else
344 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
345 icg, izone, zones->nizone);
348 *jcg1 = zones->izone[izone].jcg1;
350 for (d = 0; d < dd->ndim; d++)
352 dim = dd->dim[d];
353 shift0[dim] = zones->izone[izone].shift0[dim];
354 shift1[dim] = zones->izone[izone].shift1[dim];
355 if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
357 /* A conservative approach, this can be optimized */
358 shift0[dim] -= 1;
359 shift1[dim] += 1;
364 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
366 /* We currently set mdatoms entries for all atoms:
367 * local + non-local + communicated for vsite + constraints
370 return dd->comm->nat[ddnatNR - 1];
373 int dd_natoms_vsite(const gmx_domdec_t *dd)
375 return dd->comm->nat[ddnatVSITE];
378 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
380 *at_start = dd->comm->nat[ddnatCON-1];
381 *at_end = dd->comm->nat[ddnatCON];
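/* dd_move_x: halo exchange of coordinates. For every decomposition
 * dimension and every communication pulse, the coordinates of the atoms
 * to be sent are packed into comm->vbuf (adding the box shift, or the
 * screw-PBC rotation, for domains at the periodic boundary), exchanged
 * with dd_sendrecv_rvec and appended after the nat_tot atoms already
 * present (or unpacked via vbuf2 when the receive is not in place).
 */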
384 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
386 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
387 int *index, *cgindex;
388 gmx_domdec_comm_t *comm;
389 gmx_domdec_comm_dim_t *cd;
390 gmx_domdec_ind_t *ind;
391 rvec shift = {0, 0, 0}, *buf, *rbuf;
392 gmx_bool bPBC, bScrew;
394 comm = dd->comm;
396 cgindex = dd->cgindex;
398 buf = comm->vbuf.v;
400 nzone = 1;
401 nat_tot = dd->nat_home;
402 for (d = 0; d < dd->ndim; d++)
404 bPBC = (dd->ci[dd->dim[d]] == 0);
405 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
406 if (bPBC)
408 copy_rvec(box[dd->dim[d]], shift);
410 cd = &comm->cd[d];
411 for (p = 0; p < cd->np; p++)
413 ind = &cd->ind[p];
414 index = ind->index;
415 n = 0;
416 if (!bPBC)
418 for (i = 0; i < ind->nsend[nzone]; i++)
420 at0 = cgindex[index[i]];
421 at1 = cgindex[index[i]+1];
422 for (j = at0; j < at1; j++)
424 copy_rvec(x[j], buf[n]);
425 n++;
429 else if (!bScrew)
431 for (i = 0; i < ind->nsend[nzone]; i++)
433 at0 = cgindex[index[i]];
434 at1 = cgindex[index[i]+1];
435 for (j = at0; j < at1; j++)
437 /* We need to shift the coordinates */
438 rvec_add(x[j], shift, buf[n]);
439 n++;
443 else
445 for (i = 0; i < ind->nsend[nzone]; i++)
447 at0 = cgindex[index[i]];
448 at1 = cgindex[index[i]+1];
449 for (j = at0; j < at1; j++)
451 /* Shift x */
452 buf[n][XX] = x[j][XX] + shift[XX];
453 /* Rotate y and z.
454 * This operation requires a special shift force
455 * treatment, which is performed in calc_vir.
457 buf[n][YY] = box[YY][YY] - x[j][YY];
458 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
459 n++;
464 if (cd->bInPlace)
466 rbuf = x + nat_tot;
468 else
470 rbuf = comm->vbuf2.v;
472 /* Send and receive the coordinates */
473 dd_sendrecv_rvec(dd, d, dddirBackward,
474 buf, ind->nsend[nzone+1],
475 rbuf, ind->nrecv[nzone+1]);
476 if (!cd->bInPlace)
478 j = 0;
479 for (zone = 0; zone < nzone; zone++)
481 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
483 copy_rvec(rbuf[j], x[i]);
484 j++;
488 nat_tot += ind->nrecv[nzone+1];
490 nzone += nzone;
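/* dd_move_f: the reverse of dd_move_x. The forces computed on
 * communicated (non-home) atoms are sent back to the domains that own
 * them and added to the local forces; when fshift is provided, the
 * contributions that crossed a periodic boundary are also accumulated
 * into the shift forces used for the virial.
 */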
494 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
496 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
497 int *index, *cgindex;
498 gmx_domdec_comm_t *comm;
499 gmx_domdec_comm_dim_t *cd;
500 gmx_domdec_ind_t *ind;
501 rvec *buf, *sbuf;
502 ivec vis;
503 int is;
504 gmx_bool bShiftForcesNeedPbc, bScrew;
506 comm = dd->comm;
508 cgindex = dd->cgindex;
510 buf = comm->vbuf.v;
512 nzone = comm->zones.n/2;
513 nat_tot = dd->nat_tot;
514 for (d = dd->ndim-1; d >= 0; d--)
516 /* Only forces in domains near the PBC boundaries need to
517 consider PBC in the treatment of fshift */
518 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
519 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
520 if (fshift == nullptr && !bScrew)
522 bShiftForcesNeedPbc = FALSE;
524 /* Determine which shift vector we need */
525 clear_ivec(vis);
526 vis[dd->dim[d]] = 1;
527 is = IVEC2IS(vis);
529 cd = &comm->cd[d];
530 for (p = cd->np-1; p >= 0; p--)
532 ind = &cd->ind[p];
533 nat_tot -= ind->nrecv[nzone+1];
534 if (cd->bInPlace)
536 sbuf = f + nat_tot;
538 else
540 sbuf = comm->vbuf2.v;
541 j = 0;
542 for (zone = 0; zone < nzone; zone++)
544 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
546 copy_rvec(f[i], sbuf[j]);
547 j++;
551 /* Communicate the forces */
552 dd_sendrecv_rvec(dd, d, dddirForward,
553 sbuf, ind->nrecv[nzone+1],
554 buf, ind->nsend[nzone+1]);
555 index = ind->index;
556 /* Add the received forces */
557 n = 0;
558 if (!bShiftForcesNeedPbc)
560 for (i = 0; i < ind->nsend[nzone]; i++)
562 at0 = cgindex[index[i]];
563 at1 = cgindex[index[i]+1];
564 for (j = at0; j < at1; j++)
566 rvec_inc(f[j], buf[n]);
567 n++;
571 else if (!bScrew)
573 /* fshift should always be defined if this function is
574 * called when bShiftForcesNeedPbc is true */
575 assert(NULL != fshift);
576 for (i = 0; i < ind->nsend[nzone]; i++)
578 at0 = cgindex[index[i]];
579 at1 = cgindex[index[i]+1];
580 for (j = at0; j < at1; j++)
582 rvec_inc(f[j], buf[n]);
583 /* Add this force to the shift force */
584 rvec_inc(fshift[is], buf[n]);
585 n++;
589 else
591 for (i = 0; i < ind->nsend[nzone]; i++)
593 at0 = cgindex[index[i]];
594 at1 = cgindex[index[i]+1];
595 for (j = at0; j < at1; j++)
597 /* Rotate the force */
598 f[j][XX] += buf[n][XX];
599 f[j][YY] -= buf[n][YY];
600 f[j][ZZ] -= buf[n][ZZ];
601 if (fshift)
603 /* Add this force to the shift force */
604 rvec_inc(fshift[is], buf[n]);
606 n++;
611 nzone /= 2;
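/* dd_atom_spread_real: same communication pattern as dd_move_x, but for
 * a single real value per atom instead of an rvec.
 */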
615 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
617 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
618 int *index, *cgindex;
619 gmx_domdec_comm_t *comm;
620 gmx_domdec_comm_dim_t *cd;
621 gmx_domdec_ind_t *ind;
622 real *buf, *rbuf;
624 comm = dd->comm;
626 cgindex = dd->cgindex;
628 buf = &comm->vbuf.v[0][0];
630 nzone = 1;
631 nat_tot = dd->nat_home;
632 for (d = 0; d < dd->ndim; d++)
634 cd = &comm->cd[d];
635 for (p = 0; p < cd->np; p++)
637 ind = &cd->ind[p];
638 index = ind->index;
639 n = 0;
640 for (i = 0; i < ind->nsend[nzone]; i++)
642 at0 = cgindex[index[i]];
643 at1 = cgindex[index[i]+1];
644 for (j = at0; j < at1; j++)
646 buf[n] = v[j];
647 n++;
651 if (cd->bInPlace)
653 rbuf = v + nat_tot;
655 else
657 rbuf = &comm->vbuf2.v[0][0];
659 /* Send and receive the coordinates */
660 dd_sendrecv_real(dd, d, dddirBackward,
661 buf, ind->nsend[nzone+1],
662 rbuf, ind->nrecv[nzone+1]);
663 if (!cd->bInPlace)
665 j = 0;
666 for (zone = 0; zone < nzone; zone++)
668 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
670 v[i] = rbuf[j];
671 j++;
675 nat_tot += ind->nrecv[nzone+1];
677 nzone += nzone;
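/* dd_atom_sum_real: same communication pattern as dd_move_f, but sums a
 * single real value per atom back onto the home atoms.
 */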
681 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
683 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
684 int *index, *cgindex;
685 gmx_domdec_comm_t *comm;
686 gmx_domdec_comm_dim_t *cd;
687 gmx_domdec_ind_t *ind;
688 real *buf, *sbuf;
690 comm = dd->comm;
692 cgindex = dd->cgindex;
694 buf = &comm->vbuf.v[0][0];
696 nzone = comm->zones.n/2;
697 nat_tot = dd->nat_tot;
698 for (d = dd->ndim-1; d >= 0; d--)
700 cd = &comm->cd[d];
701 for (p = cd->np-1; p >= 0; p--)
703 ind = &cd->ind[p];
704 nat_tot -= ind->nrecv[nzone+1];
705 if (cd->bInPlace)
707 sbuf = v + nat_tot;
709 else
711 sbuf = &comm->vbuf2.v[0][0];
712 j = 0;
713 for (zone = 0; zone < nzone; zone++)
715 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
717 sbuf[j] = v[i];
718 j++;
722 /* Communicate the forces */
723 dd_sendrecv_real(dd, d, dddirForward,
724 sbuf, ind->nrecv[nzone+1],
725 buf, ind->nsend[nzone+1]);
726 index = ind->index;
727 /* Add the received forces */
728 n = 0;
729 for (i = 0; i < ind->nsend[nzone]; i++)
731 at0 = cgindex[index[i]];
732 at1 = cgindex[index[i]+1];
733 for (j = at0; j < at1; j++)
735 v[j] += buf[n];
736 n++;
740 nzone /= 2;
744 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
746 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
747 d, i, j,
748 zone->min0, zone->max1,
 749             zone->mch0, zone->mch1,
750 zone->p1_0, zone->p1_1);
754 #define DDZONECOMM_MAXZONE 5
755 #define DDZONECOMM_BUFSIZE 3
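/* Each gmx_ddzone_t is packed into DDZONECOMM_BUFSIZE (= 3) rvecs, i.e.
 * 9 reals of which 7 are used (min0, max1, min1, mch0, mch1, p1_0, p1_1)
 * and 2 are padding, before being exchanged with dd_sendrecv_rvec.
 */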
757 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
758 int ddimind, int direction,
759 gmx_ddzone_t *buf_s, int n_s,
760 gmx_ddzone_t *buf_r, int n_r)
762 #define ZBS DDZONECOMM_BUFSIZE
763 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
764 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
765 int i;
767 for (i = 0; i < n_s; i++)
769 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
770 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
771 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
772 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
773 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
774 vbuf_s[i*ZBS+1][2] = 0;
775 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
776 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
777 vbuf_s[i*ZBS+2][2] = 0;
780 dd_sendrecv_rvec(dd, ddimind, direction,
781 vbuf_s, n_s*ZBS,
782 vbuf_r, n_r*ZBS);
784 for (i = 0; i < n_r; i++)
786 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
787 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
788 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
789 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
790 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
791 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
792 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
795 #undef ZBS
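/* dd_move_cellx: with dynamic load balancing the cells are staggered, so
 * the zone boundaries are not known locally. This routine communicates
 * the cell extremes and maximum communication heights between neighboring
 * domains and updates the neighbor-search bounds cell_ns_x0/cell_ns_x1,
 * as well as cell_f_max0/cell_f_min1 used for the grid-jump check.
 */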
798 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
799 rvec cell_ns_x0, rvec cell_ns_x1)
801 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
802 gmx_ddzone_t *zp;
803 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
804 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
805 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
806 rvec extr_s[2], extr_r[2];
807 rvec dh;
808 real dist_d, c = 0, det;
809 gmx_domdec_comm_t *comm;
810 gmx_bool bPBC, bUse;
812 comm = dd->comm;
814 for (d = 1; d < dd->ndim; d++)
816 dim = dd->dim[d];
817 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
818 zp->min0 = cell_ns_x0[dim];
819 zp->max1 = cell_ns_x1[dim];
820 zp->min1 = cell_ns_x1[dim];
821 zp->mch0 = cell_ns_x0[dim];
822 zp->mch1 = cell_ns_x1[dim];
823 zp->p1_0 = cell_ns_x0[dim];
824 zp->p1_1 = cell_ns_x1[dim];
827 for (d = dd->ndim-2; d >= 0; d--)
829 dim = dd->dim[d];
830 bPBC = (dim < ddbox->npbcdim);
 832         /* Use an rvec to store two reals; the third element mirrors the second */
833 extr_s[d][0] = comm->cell_f0[d+1];
834 extr_s[d][1] = comm->cell_f1[d+1];
835 extr_s[d][2] = comm->cell_f1[d+1];
837 pos = 0;
838 /* Store the extremes in the backward sending buffer,
 839          * so they get updated separately from the forward communication.
841 for (d1 = d; d1 < dd->ndim-1; d1++)
843 /* We invert the order to be able to use the same loop for buf_e */
844 buf_s[pos].min0 = extr_s[d1][1];
845 buf_s[pos].max1 = extr_s[d1][0];
846 buf_s[pos].min1 = extr_s[d1][2];
847 buf_s[pos].mch0 = 0;
848 buf_s[pos].mch1 = 0;
849 /* Store the cell corner of the dimension we communicate along */
850 buf_s[pos].p1_0 = comm->cell_x0[dim];
851 buf_s[pos].p1_1 = 0;
852 pos++;
855 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
856 pos++;
858 if (dd->ndim == 3 && d == 0)
860 buf_s[pos] = comm->zone_d2[0][1];
861 pos++;
862 buf_s[pos] = comm->zone_d1[0];
863 pos++;
866 /* We only need to communicate the extremes
867 * in the forward direction
869 npulse = comm->cd[d].np;
870 if (bPBC)
872 /* Take the minimum to avoid double communication */
873 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
875 else
877 /* Without PBC we should really not communicate over
878 * the boundaries, but implementing that complicates
879 * the communication setup and therefore we simply
880 * do all communication, but ignore some data.
882 npulse_min = npulse;
884 for (p = 0; p < npulse_min; p++)
886 /* Communicate the extremes forward */
887 bUse = (bPBC || dd->ci[dim] > 0);
889 dd_sendrecv_rvec(dd, d, dddirForward,
890 extr_s+d, dd->ndim-d-1,
891 extr_r+d, dd->ndim-d-1);
893 if (bUse)
895 for (d1 = d; d1 < dd->ndim-1; d1++)
897 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
898 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
899 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
904 buf_size = pos;
905 for (p = 0; p < npulse; p++)
907 /* Communicate all the zone information backward */
908 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
910 dd_sendrecv_ddzone(dd, d, dddirBackward,
911 buf_s, buf_size,
912 buf_r, buf_size);
914 clear_rvec(dh);
915 if (p > 0)
917 for (d1 = d+1; d1 < dd->ndim; d1++)
919 /* Determine the decrease of maximum required
 920                      * communication height along d1 due to the distance along d;
 921                      * this avoids a lot of useless atom communication.
923 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
925 if (ddbox->tric_dir[dim])
927 /* c is the off-diagonal coupling between the cell planes
928 * along directions d and d1.
930 c = ddbox->v[dim][dd->dim[d1]][dim];
932 else
934 c = 0;
936 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
937 if (det > 0)
939 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
941 else
943 /* A negative value signals out of range */
944 dh[d1] = -1;
949 /* Accumulate the extremes over all pulses */
950 for (i = 0; i < buf_size; i++)
952 if (p == 0)
954 buf_e[i] = buf_r[i];
956 else
958 if (bUse)
960 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
961 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
962 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
965 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
967 d1 = 1;
969 else
971 d1 = d + 1;
973 if (bUse && dh[d1] >= 0)
975 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
976 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
979 /* Copy the received buffer to the send buffer,
980 * to pass the data through with the next pulse.
982 buf_s[i] = buf_r[i];
984 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
985 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
987 /* Store the extremes */
988 pos = 0;
990 for (d1 = d; d1 < dd->ndim-1; d1++)
992 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
993 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
994 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
995 pos++;
998 if (d == 1 || (d == 0 && dd->ndim == 3))
1000 for (i = d; i < 2; i++)
1002 comm->zone_d2[1-d][i] = buf_e[pos];
1003 pos++;
1006 if (d == 0)
1008 comm->zone_d1[1] = buf_e[pos];
1009 pos++;
1015 if (dd->ndim >= 2)
1017 dim = dd->dim[1];
1018 for (i = 0; i < 2; i++)
1020 if (debug)
1022 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1024 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1025 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1028 if (dd->ndim >= 3)
1030 dim = dd->dim[2];
1031 for (i = 0; i < 2; i++)
1033 for (j = 0; j < 2; j++)
1035 if (debug)
1037 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1039 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1040 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1044 for (d = 1; d < dd->ndim; d++)
1046 comm->cell_f_max0[d] = extr_s[d-1][0];
1047 comm->cell_f_min1[d] = extr_s[d-1][1];
1048 if (debug)
1050 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1051 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
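/* dd_collect_cg: gathers the charge-group counts and global indices of
 * all ranks on the master, so that complete state vectors can be
 * collected afterwards. Returns immediately when the master copy is
 * already in sync (matching ddp_count).
 */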
1056 static void dd_collect_cg(gmx_domdec_t *dd,
1057 t_state *state_local)
1059 gmx_domdec_master_t *ma = nullptr;
1060 int buf2[2], *ibuf, i, ncg_home = 0, *cg = nullptr, nat_home = 0;
1062 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1064 /* The master has the correct distribution */
1065 return;
1068 if (state_local->ddp_count == dd->ddp_count)
1070 /* The local state and DD are in sync, use the DD indices */
1071 ncg_home = dd->ncg_home;
1072 cg = dd->index_gl;
1073 nat_home = dd->nat_home;
1075 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1077 /* The DD is out of sync with the local state, but we have stored
1078 * the cg indices with the local state, so we can use those.
1080 t_block *cgs_gl;
1082 cgs_gl = &dd->comm->cgs_gl;
1084 ncg_home = state_local->cg_gl.size();
1085 cg = state_local->cg_gl.data();
1086 nat_home = 0;
1087 for (i = 0; i < ncg_home; i++)
1089 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1092 else
1094 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1097 buf2[0] = ncg_home;
1098 buf2[1] = nat_home;
1099 if (DDMASTER(dd))
1101 ma = dd->ma;
1102 ibuf = ma->ibuf;
1104 else
1106 ibuf = nullptr;
1108 /* Collect the charge group and atom counts on the master */
1109 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1111 if (DDMASTER(dd))
1113 ma->index[0] = 0;
1114 for (i = 0; i < dd->nnodes; i++)
1116 ma->ncg[i] = ma->ibuf[2*i];
1117 ma->nat[i] = ma->ibuf[2*i+1];
1118 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1121 /* Make byte counts and indices */
1122 for (i = 0; i < dd->nnodes; i++)
1124 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1125 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1127 if (debug)
1129 fprintf(debug, "Initial charge group distribution: ");
1130 for (i = 0; i < dd->nnodes; i++)
1132 fprintf(debug, " %d", ma->ncg[i]);
1134 fprintf(debug, "\n");
1138 /* Collect the charge group indices on the master */
1139 dd_gatherv(dd,
1140 ncg_home*sizeof(int), cg,
1141 DDMASTER(dd) ? ma->ibuf : nullptr,
1142 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1143 DDMASTER(dd) ? ma->cg : nullptr);
1145 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1148 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1149 const rvec *lv, rvec *v)
1151 gmx_domdec_master_t *ma;
1152 int n, i, c, a, nalloc = 0;
1153 rvec *buf = nullptr;
1154 t_block *cgs_gl;
1156 ma = dd->ma;
1158 if (!DDMASTER(dd))
1160 #if GMX_MPI
1161 MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
1162 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1163 #endif
1165 else
1167 /* Copy the master coordinates to the global array */
1168 cgs_gl = &dd->comm->cgs_gl;
1170 n = DDMASTERRANK(dd);
1171 a = 0;
1172 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1174 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1176 copy_rvec(lv[a++], v[c]);
1180 for (n = 0; n < dd->nnodes; n++)
1182 if (n != dd->rank)
1184 if (ma->nat[n] > nalloc)
1186 nalloc = over_alloc_dd(ma->nat[n]);
1187 srenew(buf, nalloc);
1189 #if GMX_MPI
1190 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1191 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1192 #endif
1193 a = 0;
1194 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1196 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1198 copy_rvec(buf[a++], v[c]);
1203 sfree(buf);
1207 static void get_commbuffer_counts(gmx_domdec_t *dd,
1208 int **counts, int **disps)
1210 gmx_domdec_master_t *ma;
1211 int n;
1213 ma = dd->ma;
 1215     /* Make the rvec count and displacement arrays */
1216 *counts = ma->ibuf;
1217 *disps = ma->ibuf + dd->nnodes;
1218 for (n = 0; n < dd->nnodes; n++)
1220 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1221 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1225 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1226 const rvec *lv, rvec *v)
1228 gmx_domdec_master_t *ma;
1229 int *rcounts = nullptr, *disps = nullptr;
1230 int n, i, c, a;
1231 rvec *buf = nullptr;
1232 t_block *cgs_gl;
1234 ma = dd->ma;
1236 if (DDMASTER(dd))
1238 get_commbuffer_counts(dd, &rcounts, &disps);
1240 buf = ma->vbuf;
1243 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1245 if (DDMASTER(dd))
1247 cgs_gl = &dd->comm->cgs_gl;
1249 a = 0;
1250 for (n = 0; n < dd->nnodes; n++)
1252 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1254 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1256 copy_rvec(buf[a++], v[c]);
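/* dd_collect_vec: collects a distributed per-atom rvec array on the
 * master rank, using separate send/receive calls for small rank counts
 * (nnodes <= GMX_DD_NNODES_SENDRECV) and a gatherv otherwise.
 */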
1263 void dd_collect_vec(gmx_domdec_t *dd,
1264 t_state *state_local,
1265 const PaddedRVecVector *localVector,
1266 rvec *v)
1268 dd_collect_cg(dd, state_local);
1270 const rvec *lv = as_rvec_array(localVector->data());
1272 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1274 dd_collect_vec_sendrecv(dd, lv, v);
1276 else
1278 dd_collect_vec_gatherv(dd, lv, v);
1282 void dd_collect_vec(gmx_domdec_t *dd,
1283 t_state *state_local,
1284 const PaddedRVecVector *localVector,
1285 PaddedRVecVector *vector)
1287 dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
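/* dd_collect_state: assembles the global t_state on the master rank.
 * Scalars and small fixed-size entries (lambdas, box matrices,
 * thermostat/barostat variables) are copied from the master's local
 * state, while the per-atom vectors x, v and cg_p are gathered with
 * dd_collect_vec.
 */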
1291 void dd_collect_state(gmx_domdec_t *dd,
1292 t_state *state_local, t_state *state)
1294 int nh = state->nhchainlength;
1296 if (DDMASTER(dd))
1298 for (int i = 0; i < efptNR; i++)
1300 state->lambda[i] = state_local->lambda[i];
1302 state->fep_state = state_local->fep_state;
1303 state->veta = state_local->veta;
1304 state->vol0 = state_local->vol0;
1305 copy_mat(state_local->box, state->box);
1306 copy_mat(state_local->boxv, state->boxv);
1307 copy_mat(state_local->svir_prev, state->svir_prev);
1308 copy_mat(state_local->fvir_prev, state->fvir_prev);
1309 copy_mat(state_local->pres_prev, state->pres_prev);
1311 for (int i = 0; i < state_local->ngtc; i++)
1313 for (int j = 0; j < nh; j++)
1315 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1316 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1318 state->therm_integral[i] = state_local->therm_integral[i];
1320 for (int i = 0; i < state_local->nnhpres; i++)
1322 for (int j = 0; j < nh; j++)
1324 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1325 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1328 state->baros_integral = state_local->baros_integral;
1330 if (state_local->flags & (1 << estX))
1332 dd_collect_vec(dd, state_local, &state_local->x, &state->x);
1334 if (state_local->flags & (1 << estV))
1336 dd_collect_vec(dd, state_local, &state_local->v, &state->v);
1338 if (state_local->flags & (1 << estCGP))
1340 dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
1344 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1346 if (debug)
1348 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1351 state_change_natoms(state, natoms);
1353 if (f != nullptr)
1355 /* We need to allocate one element extra, since we might use
1356 * (unaligned) 4-wide SIMD loads to access rvec entries.
1358 f->resize(natoms + 1);
1362 static void dd_check_alloc_ncg(t_forcerec *fr,
1363 t_state *state,
1364 PaddedRVecVector *f,
1365 int numChargeGroups)
1367 if (numChargeGroups > fr->cg_nalloc)
1369 if (debug)
1371 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1373 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1374 srenew(fr->cginfo, fr->cg_nalloc);
1375 if (fr->cutoff_scheme == ecutsGROUP)
1377 srenew(fr->cg_cm, fr->cg_nalloc);
1380 if (fr->cutoff_scheme == ecutsVERLET)
 1382         /* We don't use charge groups; we use x in state to set up
1383 * the atom communication.
1385 dd_resize_state(state, f, numChargeGroups);
1389 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1390 rvec *v, rvec *lv)
1392 gmx_domdec_master_t *ma;
1393 int n, i, c, a, nalloc = 0;
1394 rvec *buf = nullptr;
1396 if (DDMASTER(dd))
1398 ma = dd->ma;
1400 for (n = 0; n < dd->nnodes; n++)
1402 if (n != dd->rank)
1404 if (ma->nat[n] > nalloc)
1406 nalloc = over_alloc_dd(ma->nat[n]);
1407 srenew(buf, nalloc);
1409 /* Use lv as a temporary buffer */
1410 a = 0;
1411 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1413 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1415 copy_rvec(v[c], buf[a++]);
1418 if (a != ma->nat[n])
1420 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1421 a, ma->nat[n]);
1424 #if GMX_MPI
1425 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1426 DDRANK(dd, n), n, dd->mpi_comm_all);
1427 #endif
1430 sfree(buf);
1431 n = DDMASTERRANK(dd);
1432 a = 0;
1433 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1435 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1437 copy_rvec(v[c], lv[a++]);
1441 else
1443 #if GMX_MPI
1444 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1445 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1446 #endif
1450 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1451 rvec *v, rvec *lv)
1453 gmx_domdec_master_t *ma;
1454 int *scounts = nullptr, *disps = nullptr;
1455 int n, i, c, a;
1456 rvec *buf = nullptr;
1458 if (DDMASTER(dd))
1460 ma = dd->ma;
1462 get_commbuffer_counts(dd, &scounts, &disps);
1464 buf = ma->vbuf;
1465 a = 0;
1466 for (n = 0; n < dd->nnodes; n++)
1468 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1470 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1472 copy_rvec(v[c], buf[a++]);
1478 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1481 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1483 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1485 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1487 else
1489 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1493 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1495 if (dfhist == nullptr)
1497 return;
1500 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1501 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1502 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1504 if (dfhist->nlambda > 0)
1506 int nlam = dfhist->nlambda;
1507 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1508 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1509 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1510 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1511 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1512 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1514 for (int i = 0; i < nlam; i++)
1516 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1517 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1518 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1519 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1520 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1521 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
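/* dd_distribute_state: the inverse of dd_collect_state. The master copies
 * the global scalars into its local state, these are broadcast to all
 * ranks (including df_history, needed when restarting from a checkpoint),
 * the local state is resized to nat_home, and the per-atom vectors are
 * scattered with dd_distribute_vec.
 */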
1526 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1527 t_state *state, t_state *state_local,
1528 PaddedRVecVector *f)
1530 int nh = state->nhchainlength;
1532 if (DDMASTER(dd))
1534 for (int i = 0; i < efptNR; i++)
1536 state_local->lambda[i] = state->lambda[i];
1538 state_local->fep_state = state->fep_state;
1539 state_local->veta = state->veta;
1540 state_local->vol0 = state->vol0;
1541 copy_mat(state->box, state_local->box);
1542 copy_mat(state->box_rel, state_local->box_rel);
1543 copy_mat(state->boxv, state_local->boxv);
1544 copy_mat(state->svir_prev, state_local->svir_prev);
1545 copy_mat(state->fvir_prev, state_local->fvir_prev);
1546 if (state->dfhist != nullptr)
1548 copy_df_history(state_local->dfhist, state->dfhist);
1550 for (int i = 0; i < state_local->ngtc; i++)
1552 for (int j = 0; j < nh; j++)
1554 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1555 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1557 state_local->therm_integral[i] = state->therm_integral[i];
1559 for (int i = 0; i < state_local->nnhpres; i++)
1561 for (int j = 0; j < nh; j++)
1563 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1564 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1567 state_local->baros_integral = state->baros_integral;
1569 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1570 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1571 dd_bcast(dd, sizeof(real), &state_local->veta);
1572 dd_bcast(dd, sizeof(real), &state_local->vol0);
1573 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1574 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1575 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1576 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1577 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1578 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1579 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1580 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1581 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1582 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1584 /* communicate df_history -- required for restarting from checkpoint */
1585 dd_distribute_dfhist(dd, state_local->dfhist);
1587 dd_resize_state(state_local, f, dd->nat_home);
1589 if (state_local->flags & (1 << estX))
1591 dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
1593 if (state_local->flags & (1 << estV))
1595 dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
1597 if (state_local->flags & (1 << estCGP))
1599 dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
1603 static char dim2char(int dim)
1605 char c = '?';
1607 switch (dim)
1609 case XX: c = 'X'; break;
1610 case YY: c = 'Y'; break;
1611 case ZZ: c = 'Z'; break;
1612 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1615 return c;
1618 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1619 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1621 rvec grid_s[2], *grid_r = nullptr, cx, r;
1622 char fname[STRLEN], buf[22];
1623 FILE *out;
1624 int a, i, d, z, y, x;
1625 matrix tric;
1626 real vol;
1628 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1629 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1631 if (DDMASTER(dd))
1633 snew(grid_r, 2*dd->nnodes);
1636 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
1638 if (DDMASTER(dd))
1640 for (d = 0; d < DIM; d++)
1642 for (i = 0; i < DIM; i++)
1644 if (d == i)
1646 tric[d][i] = 1;
1648 else
1650 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1652 tric[d][i] = box[i][d]/box[i][i];
1654 else
1656 tric[d][i] = 0;
1661 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1662 out = gmx_fio_fopen(fname, "w");
1663 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1664 a = 1;
1665 for (i = 0; i < dd->nnodes; i++)
1667 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1668 for (d = 0; d < DIM; d++)
1670 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1672 for (z = 0; z < 2; z++)
1674 for (y = 0; y < 2; y++)
1676 for (x = 0; x < 2; x++)
1678 cx[XX] = grid_r[i*2+x][XX];
1679 cx[YY] = grid_r[i*2+y][YY];
1680 cx[ZZ] = grid_r[i*2+z][ZZ];
1681 mvmul(tric, cx, r);
1682 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1683 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1687 for (d = 0; d < DIM; d++)
1689 for (x = 0; x < 4; x++)
1691 switch (d)
1693 case 0: y = 1 + i*8 + 2*x; break;
1694 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1695 case 2: y = 1 + i*8 + x; break;
1697 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1701 gmx_fio_fclose(out);
1702 sfree(grid_r);
1706 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1707 const gmx_mtop_t *mtop, t_commrec *cr,
1708 int natoms, rvec x[], matrix box)
1710 char fname[STRLEN], buf[22];
1711 FILE *out;
1712 int i, ii, resnr, c;
1713 const char *atomname, *resname;
1714 real b;
1715 gmx_domdec_t *dd;
1717 dd = cr->dd;
1718 if (natoms == -1)
1720 natoms = dd->comm->nat[ddnatVSITE];
1723 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1725 out = gmx_fio_fopen(fname, "w");
1727 fprintf(out, "TITLE %s\n", title);
1728 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1729 int molb = 0;
1730 for (i = 0; i < natoms; i++)
1732 ii = dd->gatindex[i];
1733 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1734 if (i < dd->comm->nat[ddnatZONE])
1736 c = 0;
1737 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1739 c++;
1741 b = c;
1743 else if (i < dd->comm->nat[ddnatVSITE])
1745 b = dd->comm->zones.n;
1747 else
1749 b = dd->comm->zones.n + 1;
1751 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1752 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1754 fprintf(out, "TER\n");
1756 gmx_fio_fclose(out);
1759 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1761 gmx_domdec_comm_t *comm;
1762 int di;
1763 real r;
1765 comm = dd->comm;
1767 r = -1;
1768 if (comm->bInterCGBondeds)
1770 if (comm->cutoff_mbody > 0)
1772 r = comm->cutoff_mbody;
1774 else
1776 /* cutoff_mbody=0 means we do not have DLB */
1777 r = comm->cellsize_min[dd->dim[0]];
1778 for (di = 1; di < dd->ndim; di++)
1780 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1782 if (comm->bBondComm)
1784 r = std::max(r, comm->cutoff_mbody);
1786 else
1788 r = std::min(r, comm->cutoff);
1793 return r;
1796 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1798 real r_mb;
1800 r_mb = dd_cutoff_multibody(dd);
1802 return std::max(dd->comm->cutoff, r_mb);
1806 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1807 ivec coord_pme)
1809 int nc, ntot;
1811 nc = dd->nc[dd->comm->cartpmedim];
1812 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1813 copy_ivec(coord, coord_pme);
1814 coord_pme[dd->comm->cartpmedim] =
1815 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
1818 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1820 int npp, npme;
1822 npp = dd->nnodes;
1823 npme = dd->comm->npmenodes;
1825 /* Here we assign a PME node to communicate with this DD node
1826 * by assuming that the major index of both is x.
1827 * We add cr->npmenodes/2 to obtain an even distribution.
1829 return (ddindex*npme + npme/2)/npp;
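/* Example (illustrative values): with npp = 8 PP ranks and npme = 4 PME
 * ranks, ddindex 0..7 map to PME index (ddindex*4 + 2)/8, i.e.
 * 0,0,1,1,2,2,3,3: two consecutive PP ranks share one PME rank.
 */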
1832 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1834 int *pme_rank;
1835 int n, i, p0, p1;
1837 snew(pme_rank, dd->comm->npmenodes);
1838 n = 0;
1839 for (i = 0; i < dd->nnodes; i++)
1841 p0 = ddindex2pmeindex(dd, i);
1842 p1 = ddindex2pmeindex(dd, i+1);
1843 if (i+1 == dd->nnodes || p1 > p0)
1845 if (debug)
1847 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1849 pme_rank[n] = i + 1 + n;
1850 n++;
1854 return pme_rank;
1857 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1859 gmx_domdec_t *dd;
1860 ivec coords;
1861 int slab;
1863 dd = cr->dd;
    /* Disabled legacy Cartesian path, kept here only for reference:
       if (dd->comm->bCartesian) {
           gmx_ddindex2xyz(dd->nc,ddindex,coords);
           dd_coords2pmecoords(dd,coords,coords_pme);
           copy_ivec(dd->ntot,nc);
           nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
           coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];

           slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
       } else {
           slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
       }
     */
1877 coords[XX] = x;
1878 coords[YY] = y;
1879 coords[ZZ] = z;
1880 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1882 return slab;
1885 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1887 gmx_domdec_comm_t *comm;
1888 ivec coords;
1889 int ddindex, nodeid = -1;
1891 comm = cr->dd->comm;
1893 coords[XX] = x;
1894 coords[YY] = y;
1895 coords[ZZ] = z;
1896 if (comm->bCartesianPP_PME)
1898 #if GMX_MPI
1899 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1900 #endif
1902 else
1904 ddindex = dd_index(cr->dd->nc, coords);
1905 if (comm->bCartesianPP)
1907 nodeid = comm->ddindex2simnodeid[ddindex];
1909 else
1911 if (comm->pmenodes)
1913 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1915 else
1917 nodeid = ddindex;
1922 return nodeid;
1925 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1926 const t_commrec gmx_unused *cr,
1927 int sim_nodeid)
1929 int pmenode = -1;
1931 const gmx_domdec_comm_t *comm = dd->comm;
1933 /* This assumes a uniform x domain decomposition grid cell size */
1934 if (comm->bCartesianPP_PME)
1936 #if GMX_MPI
1937 ivec coord, coord_pme;
1938 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1939 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1941 /* This is a PP node */
1942 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1943 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1945 #endif
1947 else if (comm->bCartesianPP)
1949 if (sim_nodeid < dd->nnodes)
1951 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1954 else
1956 /* This assumes DD cells with identical x coordinates
1957 * are numbered sequentially.
1959 if (dd->comm->pmenodes == nullptr)
1961 if (sim_nodeid < dd->nnodes)
1963 /* The DD index equals the nodeid */
1964 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1967 else
1969 int i = 0;
1970 while (sim_nodeid > dd->comm->pmenodes[i])
1972 i++;
1974 if (sim_nodeid < dd->comm->pmenodes[i])
1976 pmenode = dd->comm->pmenodes[i];
1981 return pmenode;
1984 void get_pme_nnodes(const gmx_domdec_t *dd,
1985 int *npmenodes_x, int *npmenodes_y)
1987 if (dd != nullptr)
1989 *npmenodes_x = dd->comm->npmenodes_x;
1990 *npmenodes_y = dd->comm->npmenodes_y;
1992 else
1994 *npmenodes_x = 1;
1995 *npmenodes_y = 1;
1999 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2000 int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2002 gmx_domdec_t *dd;
2003 int x, y, z;
2004 ivec coord, coord_pme;
2006 dd = cr->dd;
2008 snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2010 *nmy_ddnodes = 0;
2011 for (x = 0; x < dd->nc[XX]; x++)
2013 for (y = 0; y < dd->nc[YY]; y++)
2015 for (z = 0; z < dd->nc[ZZ]; z++)
2017 if (dd->comm->bCartesianPP_PME)
2019 coord[XX] = x;
2020 coord[YY] = y;
2021 coord[ZZ] = z;
2022 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2023 if (dd->ci[XX] == coord_pme[XX] &&
2024 dd->ci[YY] == coord_pme[YY] &&
2025 dd->ci[ZZ] == coord_pme[ZZ])
2027 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2030 else
2032 /* The slab corresponds to the nodeid in the PME group */
2033 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2035 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2042 /* The last PP-only node is the peer node */
2043 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2045 if (debug)
2047 fprintf(debug, "Receive coordinates from PP ranks:");
2048 for (x = 0; x < *nmy_ddnodes; x++)
2050 fprintf(debug, " %d", (*my_ddnodes)[x]);
2052 fprintf(debug, "\n");
2056 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2058 gmx_bool bReceive = TRUE;
2060 if (cr->npmenodes < dd->nnodes)
2062 gmx_domdec_comm_t *comm = dd->comm;
2063 if (comm->bCartesianPP_PME)
2065 #if GMX_MPI
2066 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2067 ivec coords;
2068 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2069 coords[comm->cartpmedim]++;
2070 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2072 int rank;
2073 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2074 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2076 /* This is not the last PP node for pmenode */
2077 bReceive = FALSE;
2080 #else
2081 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2082 #endif
2084 else
2086 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2087 if (cr->sim_nodeid+1 < cr->nnodes &&
2088 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2090 /* This is not the last PP node for pmenode */
2091 bReceive = FALSE;
2096 return bReceive;
2099 static void set_zones_ncg_home(gmx_domdec_t *dd)
2101 gmx_domdec_zones_t *zones;
2102 int i;
2104 zones = &dd->comm->zones;
2106 zones->cg_range[0] = 0;
2107 for (i = 1; i < zones->n+1; i++)
2109 zones->cg_range[i] = dd->ncg_home;
2111 /* zone_ncg1[0] should always be equal to ncg_home */
2112 dd->comm->zone_ncg1[0] = dd->ncg_home;
2115 static void rebuild_cgindex(gmx_domdec_t *dd,
2116 const int *gcgs_index, const t_state *state)
2118 int * gmx_restrict dd_cg_gl = dd->index_gl;
2119 int * gmx_restrict cgindex = dd->cgindex;
2120 int nat = 0;
2122 /* Copy back the global charge group indices from state
2123 * and rebuild the local charge group to atom index.
2125 cgindex[0] = nat;
2126 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2128 cgindex[i] = nat;
2129 int cg_gl = state->cg_gl[i];
2130 dd_cg_gl[i] = cg_gl;
2131 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2133 cgindex[state->cg_gl.size()] = nat;
2135 dd->ncg_home = state->cg_gl.size();
2136 dd->nat_home = nat;
2138 set_zones_ncg_home(dd);
2141 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2143 while (cg >= cginfo_mb->cg_end)
2145 cginfo_mb++;
2148 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2151 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2152 t_forcerec *fr, char *bLocalCG)
2154 cginfo_mb_t *cginfo_mb;
2155 int *cginfo;
2156 int cg;
2158 if (fr != nullptr)
2160 cginfo_mb = fr->cginfo_mb;
2161 cginfo = fr->cginfo;
2163 for (cg = cg0; cg < cg1; cg++)
2165 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2169 if (bLocalCG != nullptr)
2171 for (cg = cg0; cg < cg1; cg++)
2173 bLocalCG[index_gl[cg]] = TRUE;
2178 static void make_dd_indices(gmx_domdec_t *dd,
2179 const int *gcgs_index, int cg_start)
2181 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2182 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2183 gmx_bool bCGs;
2185 if (dd->nat_tot > dd->gatindex_nalloc)
2187 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2188 srenew(dd->gatindex, dd->gatindex_nalloc);
2191 nzone = dd->comm->zones.n;
2192 zone2cg = dd->comm->zones.cg_range;
2193 zone_ncg1 = dd->comm->zone_ncg1;
2194 index_gl = dd->index_gl;
2195 gatindex = dd->gatindex;
2196 bCGs = dd->comm->bCGs;
2198 if (zone2cg[1] != dd->ncg_home)
2200 gmx_incons("dd->ncg_zone is not up to date");
2203 /* Make the local to global and global to local atom index */
2204 a = dd->cgindex[cg_start];
2205 for (zone = 0; zone < nzone; zone++)
2207 if (zone == 0)
2209 cg0 = cg_start;
2211 else
2213 cg0 = zone2cg[zone];
2215 cg1 = zone2cg[zone+1];
2216 cg1_p1 = cg0 + zone_ncg1[zone];
2218 for (cg = cg0; cg < cg1; cg++)
2220 zone1 = zone;
2221 if (cg >= cg1_p1)
2223 /* Signal that this cg is from more than one pulse away */
2224 zone1 += nzone;
2226 cg_gl = index_gl[cg];
2227 if (bCGs)
2229 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2231 gatindex[a] = a_gl;
2232 ga2la_set(dd->ga2la, a_gl, a, zone1);
2233 a++;
2236 else
2238 gatindex[a] = cg_gl;
2239 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2240 a++;
2246 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2247 const char *where)
2249 int i, ngl, nerr;
2251 nerr = 0;
2252 if (bLocalCG == nullptr)
2254 return nerr;
2256 for (i = 0; i < dd->ncg_tot; i++)
2258 if (!bLocalCG[dd->index_gl[i]])
2260 fprintf(stderr,
2261 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2262 nerr++;
2265 ngl = 0;
2266 for (i = 0; i < ncg_sys; i++)
2268 if (bLocalCG[i])
2270 ngl++;
2273 if (ngl != dd->ncg_tot)
2275 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2276 nerr++;
2279 return nerr;
2282 static void check_index_consistency(gmx_domdec_t *dd,
2283 int natoms_sys, int ncg_sys,
2284 const char *where)
2286 int nerr, ngl, i, a, cell;
2287 int *have;
2289 nerr = 0;
2291 if (dd->comm->DD_debug > 1)
2293 snew(have, natoms_sys);
2294 for (a = 0; a < dd->nat_tot; a++)
2296 if (have[dd->gatindex[a]] > 0)
2298 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2300 else
2302 have[dd->gatindex[a]] = a + 1;
2305 sfree(have);
2308 snew(have, dd->nat_tot);
2310 ngl = 0;
2311 for (i = 0; i < natoms_sys; i++)
2313 if (ga2la_get(dd->ga2la, i, &a, &cell))
2315 if (a >= dd->nat_tot)
2317 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2318 nerr++;
2320 else
2322 have[a] = 1;
2323 if (dd->gatindex[a] != i)
2325 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2326 nerr++;
2329 ngl++;
2332 if (ngl != dd->nat_tot)
2334 fprintf(stderr,
2335 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2336 dd->rank, where, ngl, dd->nat_tot);
2338 for (a = 0; a < dd->nat_tot; a++)
2340 if (have[a] == 0)
2342 fprintf(stderr,
2343 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2344 dd->rank, where, a+1, dd->gatindex[a]+1);
2347 sfree(have);
2349 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2351 if (nerr > 0)
2353 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2354 dd->rank, where, nerr);
2358 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2360 int i;
2361 char *bLocalCG;
2363 if (a_start == 0)
2365 /* Clear the whole list without searching */
2366 ga2la_clear(dd->ga2la);
2368 else
2370 for (i = a_start; i < dd->nat_tot; i++)
2372 ga2la_del(dd->ga2la, dd->gatindex[i]);
2376 bLocalCG = dd->comm->bLocalCG;
2377 if (bLocalCG)
2379 for (i = cg_start; i < dd->ncg_tot; i++)
2381 bLocalCG[dd->index_gl[i]] = FALSE;
2385 dd_clear_local_vsite_indices(dd);
2387 if (dd->constraints)
2389 dd_clear_local_constraint_indices(dd);
 2393 /* This function should be used for moving the domain boundaries during DLB,
2394 * for obtaining the minimum cell size. It checks the initially set limit
2395 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2396 * and, possibly, a longer cut-off limit set for PME load balancing.
2398 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2400 real cellsize_min;
2402 cellsize_min = comm->cellsize_min[dim];
2404 if (!comm->bVacDLBNoLimit)
 2406         /* The cut-off might have changed, e.g. by PME load balancing,
2407 * from the value used to set comm->cellsize_min, so check it.
2409 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2411 if (comm->bPMELoadBalDLBLimits)
2413 /* Check for the cut-off limit set by the PME load balancing */
2414 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2418 return cellsize_min;
2421 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2422 int dim_ind)
2424 real grid_jump_limit;
2426 /* The distance between the boundaries of cells at distance
2427 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2428 * and by the fact that cells should not be shifted by more than
2429 * half their size, such that cg's only shift by one cell
2430 * at redecomposition.
2432 grid_jump_limit = comm->cellsize_limit;
2433 if (!comm->bVacDLBNoLimit)
2435 if (comm->bPMELoadBalDLBLimits)
2437 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2439 grid_jump_limit = std::max(grid_jump_limit,
2440 cutoff/comm->cd[dim_ind].np);
2443 return grid_jump_limit;
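/* Illustration with hypothetical numbers: with comm->cellsize_limit = 0.8 nm,
 * cutoff = 1.2 nm and cd[dim_ind].np = 1 pulse, the jump limit is
 * max(0.8, 1.2/1) = 1.2 nm; with 2 pulses it is max(0.8, 0.6) = 0.8 nm.
 * check_grid_jump() below compares the staggering of neighboring cell rows
 * against this distance.
 */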
2446 static gmx_bool check_grid_jump(gmx_int64_t step,
2447 gmx_domdec_t *dd,
2448 real cutoff,
2449 gmx_ddbox_t *ddbox,
2450 gmx_bool bFatal)
2452 gmx_domdec_comm_t *comm;
2453 int d, dim;
2454 real limit, bfac;
2455 gmx_bool bInvalid;
2457 bInvalid = FALSE;
2459 comm = dd->comm;
2461 for (d = 1; d < dd->ndim; d++)
2463 dim = dd->dim[d];
2464 limit = grid_jump_limit(comm, cutoff, d);
2465 bfac = ddbox->box_size[dim];
2466 if (ddbox->tric_dir[dim])
2468 bfac *= ddbox->skew_fac[dim];
2470 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2471 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2473 bInvalid = TRUE;
2475 if (bFatal)
2477 char buf[22];
2479 /* This error should never be triggered under normal
2480 * circumstances, but you never know ...
2482 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2483 gmx_step_str(step, buf),
2484 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2489 return bInvalid;
2492 static int dd_load_count(gmx_domdec_comm_t *comm)
2494 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2497 static float dd_force_load(gmx_domdec_comm_t *comm)
2499 float load;
2501 if (comm->eFlop)
2503 load = comm->flop;
2504 if (comm->eFlop > 1)
2506 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2509 else
2511 load = comm->cycl[ddCyclF];
2512 if (comm->cycl_n[ddCyclF] > 1)
2514 /* Subtract the maximum of the last n cycle counts
2515 * to get rid of possible high counts due to other sources,
2516 * for instance system activity, that would otherwise
2517 * affect the dynamic load balancing.
2519 load -= comm->cycl_max[ddCyclF];
2522 #if GMX_MPI
2523 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2525 float gpu_wait, gpu_wait_sum;
2527 gpu_wait = comm->cycl[ddCyclWaitGPU];
2528 if (comm->cycl_n[ddCyclF] > 1)
2530 /* We should remove the WaitGPU time of the same MD step
2531 * as the one with the maximum F time, since the F time
2532 * and the wait time are not independent.
2533 * Furthermore, the step for the max F time should be chosen
2534 * the same on all ranks that share the same GPU.
2535 * But to keep the code simple, we remove the average instead.
2536 * The main reason for artificially long times at some steps
2537 * is spurious CPU activity or MPI time, so we don't expect
2538 * that changes in the GPU wait time matter a lot here.
2540 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2542 /* Sum the wait times over the ranks that share the same GPU */
2543 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2544 comm->mpi_comm_gpu_shared);
2545 /* Replace the wait time by the average over the ranks */
2546 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2548 #endif
2551 return load;
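/* Illustrative example of the GPU-wait correction above (hypothetical
 * numbers): if two PP ranks share a GPU with measured gpu_wait times of
 * 10 and 30 cycle units, both ranks end up accounting the average of 20.
 * The sum over the two ranks is unchanged (-10+20 and -30+20 cancel), but
 * the load difference caused purely by waiting for the shared GPU is
 * removed, so DLB does not try to balance it away by resizing cells.
 */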
2554 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2556 gmx_domdec_comm_t *comm;
2557 int i;
2559 comm = dd->comm;
2561 snew(*dim_f, dd->nc[dim]+1);
2562 (*dim_f)[0] = 0;
2563 for (i = 1; i < dd->nc[dim]; i++)
2565 if (comm->slb_frac[dim])
2567 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2569 else
2571 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2574 (*dim_f)[dd->nc[dim]] = 1;
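/* Example of the resulting cumulative fractions (hypothetical grid): for
 * dd->nc[dim] = 4 without slb_frac the array is {0, 0.25, 0.5, 0.75, 1};
 * with slb_frac[dim] = {0.4, 0.3, 0.2, 0.1} it is {0, 0.4, 0.7, 0.9, 1}.
 * These fractions are later passed to set_pme_maxshift() via
 * ddpme->slb_dim_f.
 */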
2577 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2579 int pmeindex, slab, nso, i;
2580 ivec xyz;
2582 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2584 ddpme->dim = YY;
2586 else
2588 ddpme->dim = dimind;
2590 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2592 ddpme->nslab = (ddpme->dim == 0 ?
2593 dd->comm->npmenodes_x :
2594 dd->comm->npmenodes_y);
2596 if (ddpme->nslab <= 1)
2598 return;
2601 nso = dd->comm->npmenodes/ddpme->nslab;
2602 /* Determine for each PME slab the PP location range for dimension dim */
2603 snew(ddpme->pp_min, ddpme->nslab);
2604 snew(ddpme->pp_max, ddpme->nslab);
2605 for (slab = 0; slab < ddpme->nslab; slab++)
2607 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2608 ddpme->pp_max[slab] = 0;
2610 for (i = 0; i < dd->nnodes; i++)
2612 ddindex2xyz(dd->nc, i, xyz);
2613 /* For y only use our y/z slab.
2614 * This assumes that the PME x grid size matches the DD grid size.
2616 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2618 pmeindex = ddindex2pmeindex(dd, i);
2619 if (dimind == 0)
2621 slab = pmeindex/nso;
2623 else
2625 slab = pmeindex % ddpme->nslab;
2627 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2628 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2632 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2635 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2637 if (dd->comm->ddpme[0].dim == XX)
2639 return dd->comm->ddpme[0].maxshift;
2641 else
2643 return 0;
2647 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2649 if (dd->comm->ddpme[0].dim == YY)
2651 return dd->comm->ddpme[0].maxshift;
2653 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2655 return dd->comm->ddpme[1].maxshift;
2657 else
2659 return 0;
2663 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2664 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2665 const real *cell_f)
2667 gmx_domdec_comm_t *comm;
2668 int nc, ns, s;
2669 int *xmin, *xmax;
2670 real range, pme_boundary;
2671 int sh;
2673 comm = dd->comm;
2674 nc = dd->nc[ddpme->dim];
2675 ns = ddpme->nslab;
2677 if (!ddpme->dim_match)
2679 /* PP decomposition is not along dim: the worst situation */
2680 sh = ns/2;
2682 else if (ns <= 3 || (bUniform && ns == nc))
2684 /* The optimal situation */
2685 sh = 1;
2687 else
2689 /* We need to check, for all PME nodes, which nodes they
2690 * could possibly need to communicate with.
2692 xmin = ddpme->pp_min;
2693 xmax = ddpme->pp_max;
2694 /* Allow for atoms to be maximally 2/3 times the cut-off
2695 * out of their DD cell. This is a reasonable balance
2696 * between performance and support for most charge-group/cut-off
2697 * combinations.
2699 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2700 /* Avoid extra communication when we are exactly at a boundary */
2701 range *= 0.999;
2703 sh = 1;
2704 for (s = 0; s < ns; s++)
2706 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2707 pme_boundary = (real)s/ns;
2708 while (sh+1 < ns &&
2709 ((s-(sh+1) >= 0 &&
2710 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2711 (s-(sh+1) < 0 &&
2712 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2714 sh++;
2716 pme_boundary = (real)(s+1)/ns;
2717 while (sh+1 < ns &&
2718 ((s+(sh+1) < ns &&
2719 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2720 (s+(sh+1) >= ns &&
2721 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2723 sh++;
2728 ddpme->maxshift = sh;
2730 if (debug)
2732 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2733 ddpme->dim, ddpme->maxshift);
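/* Reading of the result (a sketch, not a formal definition): maxshift = sh
 * means that PME slab s may have to receive coordinates from PP cells that
 * map onto slabs s-sh .. s+sh, so a larger shift implies more redistribution
 * traffic before the PME grid operations. For example, the "optimal
 * situation" branch above (ns <= 3, or a uniform grid with ns == nc)
 * yields sh = 1, i.e. only nearest-neighbor slab communication.
 */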
2737 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2739 int d, dim;
2741 for (d = 0; d < dd->ndim; d++)
2743 dim = dd->dim[d];
2744 if (dim < ddbox->nboundeddim &&
2745 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2746 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2748 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2749 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2750 dd->nc[dim], dd->comm->cellsize_limit);
2755 enum {
2756 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2759 /* Set the domain boundaries. Use for static (or no) load balancing,
2760 * and also for the starting state for dynamic load balancing.
2761 * setmode determines if and where the boundaries are stored; use the enum above.
2762 * Returns the number of communication pulses in npulse.
2764 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2765 int setmode, ivec npulse)
2767 gmx_domdec_comm_t *comm;
2768 int d, j;
2769 rvec cellsize_min;
2770 real *cell_x, cell_dx, cellsize;
2772 comm = dd->comm;
2774 for (d = 0; d < DIM; d++)
2776 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2777 npulse[d] = 1;
2778 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2780 /* Uniform grid */
2781 cell_dx = ddbox->box_size[d]/dd->nc[d];
2782 switch (setmode)
2784 case setcellsizeslbMASTER:
2785 for (j = 0; j < dd->nc[d]+1; j++)
2787 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2789 break;
2790 case setcellsizeslbLOCAL:
2791 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2792 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2793 break;
2794 default:
2795 break;
2797 cellsize = cell_dx*ddbox->skew_fac[d];
2798 while (cellsize*npulse[d] < comm->cutoff)
2800 npulse[d]++;
2802 cellsize_min[d] = cellsize;
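/* The loop above picks the smallest npulse[d] with
 * cellsize*npulse[d] >= comm->cutoff; e.g. a 0.75 nm cell and a 1.4 nm
 * cut-off (hypothetical numbers) give npulse[d] = 2 communication pulses
 * in this dimension.
 */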
2804 else
2806 /* Statically load balanced grid */
2807 /* Also when we are not doing a master distribution, we determine
2808 * all cell borders in a loop to obtain identical values
2809 * to the master distribution case and to determine npulse.
2811 if (setmode == setcellsizeslbMASTER)
2813 cell_x = dd->ma->cell_x[d];
2815 else
2817 snew(cell_x, dd->nc[d]+1);
2819 cell_x[0] = ddbox->box0[d];
2820 for (j = 0; j < dd->nc[d]; j++)
2822 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2823 cell_x[j+1] = cell_x[j] + cell_dx;
2824 cellsize = cell_dx*ddbox->skew_fac[d];
2825 while (cellsize*npulse[d] < comm->cutoff &&
2826 npulse[d] < dd->nc[d]-1)
2828 npulse[d]++;
2830 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2832 if (setmode == setcellsizeslbLOCAL)
2834 comm->cell_x0[d] = cell_x[dd->ci[d]];
2835 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2837 if (setmode != setcellsizeslbMASTER)
2839 sfree(cell_x);
2842 /* The following limitation is to avoid that a cell would receive
2843 * some of its own home charge groups back over the periodic boundary.
2844 * Double charge groups cause trouble with the global indices.
2846 if (d < ddbox->npbcdim &&
2847 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2849 char error_string[STRLEN];
2851 sprintf(error_string,
2852 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2853 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2854 comm->cutoff,
2855 dd->nc[d], dd->nc[d],
2856 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2858 if (setmode == setcellsizeslbLOCAL)
2860 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2861 error_string);
2863 else
2865 gmx_fatal(FARGS, error_string);
2870 if (!dlbIsOn(comm))
2872 copy_rvec(cellsize_min, comm->cellsize_min);
2875 for (d = 0; d < comm->npmedecompdim; d++)
2877 set_pme_maxshift(dd, &comm->ddpme[d],
2878 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2879 comm->ddpme[d].slb_dim_f);
2884 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2885 int d, int dim, domdec_root_t *root,
2886 const gmx_ddbox_t *ddbox,
2887 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2889 gmx_domdec_comm_t *comm;
2890 int ncd, i, j, nmin, nmin_old;
2891 gmx_bool bLimLo, bLimHi;
2892 real *cell_size;
2893 real fac, halfway, cellsize_limit_f_i, region_size;
2894 gmx_bool bPBC, bLastHi = FALSE;
2895 int nrange[] = {range[0], range[1]};
2897 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2899 comm = dd->comm;
2901 ncd = dd->nc[dim];
2903 bPBC = (dim < ddbox->npbcdim);
2905 cell_size = root->buf_ncd;
2907 if (debug)
2909 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2912 /* First we need to check if the scaling does not make cells
2913 * smaller than the smallest allowed size.
2914 * We need to do this iteratively, since if a cell is too small,
2915 * it needs to be enlarged, which makes all the other cells smaller,
2916 * which could in turn make another cell smaller than allowed.
2918 for (i = range[0]; i < range[1]; i++)
2920 root->bCellMin[i] = FALSE;
2922 nmin = 0;
2925 nmin_old = nmin;
2926 /* We need the total for normalization */
2927 fac = 0;
2928 for (i = range[0]; i < range[1]; i++)
2930 if (root->bCellMin[i] == FALSE)
2932 fac += cell_size[i];
2935 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2936 /* Determine the cell boundaries */
2937 for (i = range[0]; i < range[1]; i++)
2939 if (root->bCellMin[i] == FALSE)
2941 cell_size[i] *= fac;
2942 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2944 cellsize_limit_f_i = 0;
2946 else
2948 cellsize_limit_f_i = cellsize_limit_f;
2950 if (cell_size[i] < cellsize_limit_f_i)
2952 root->bCellMin[i] = TRUE;
2953 cell_size[i] = cellsize_limit_f_i;
2954 nmin++;
2957 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2960 while (nmin > nmin_old);
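/* Worked example of the normalization above (hypothetical fractions):
 * region_size = 1, cellsize_limit_f = 0.2 and cell_size = {0.5, 0.35, 0.15}.
 * Pass 1: fac = 1, the 0.15 cell is pinned to 0.2, nmin = 1.
 * Pass 2: the free cells sum to 0.85, fac = (1 - 1*0.2)/0.85 = 0.94, so the
 * free cells become {0.47, 0.33}; all sizes again sum to region_size and no
 * new cell drops below the limit, so the loop terminates.
 */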
2962 i = range[1]-1;
2963 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2964 /* For this check we should not use DD_CELL_MARGIN,
2965 * but a slightly smaller factor,
2966 * since rounding could get us below the limit.
2968 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2970 char buf[22];
2971 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2972 gmx_step_str(step, buf),
2973 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2974 ncd, comm->cellsize_min[dim]);
2977 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2979 if (!bUniform)
2981 /* Check that the boundary did not move more than halfway across
2982 * each of the cells it bounds, as this could cause problems,
2983 * especially when the differences between cell sizes are large.
2984 * If changes are applied, they will not make cells smaller
2985 * than the cut-off, as we check all the boundaries which
2986 * might be affected by a change and if the old state was ok,
2987 * the cells will at most be shrunk back to their old size.
2989 for (i = range[0]+1; i < range[1]; i++)
2991 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
2992 if (root->cell_f[i] < halfway)
2994 root->cell_f[i] = halfway;
2995 /* Check if the change also causes shifts of the next boundaries */
2996 for (j = i+1; j < range[1]; j++)
2998 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3000 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3004 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3005 if (root->cell_f[i] > halfway)
3007 root->cell_f[i] = halfway;
3008 /* Check if the change also causes shifts of the next boundaries */
3009 for (j = i-1; j >= range[0]+1; j--)
3011 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3013 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3020 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3021 /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3022 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3023 * For a and b, nrange is used. */
3024 if (d > 0)
3026 /* Take care of the staggering of the cell boundaries */
3027 if (bUniform)
3029 for (i = range[0]; i < range[1]; i++)
3031 root->cell_f_max0[i] = root->cell_f[i];
3032 root->cell_f_min1[i] = root->cell_f[i+1];
3035 else
3037 for (i = range[0]+1; i < range[1]; i++)
3039 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3040 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3041 if (bLimLo && bLimHi)
3043 /* Both limits violated, try the best we can */
3044 /* For this case we split the original range (range) into two parts and handle the other limitations in the next iteration. */
3045 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3046 nrange[0] = range[0];
3047 nrange[1] = i;
3048 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3050 nrange[0] = i;
3051 nrange[1] = range[1];
3052 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3054 return;
3056 else if (bLimLo)
3058 /* root->cell_f[i] = root->bound_min[i]; */
3059 nrange[1] = i; /* only store violation location. There could be a LimLo violation following with a higher index */
3060 bLastHi = FALSE;
3062 else if (bLimHi && !bLastHi)
3064 bLastHi = TRUE;
3065 if (nrange[1] < range[1]) /* found a LimLo before */
3067 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3068 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3069 nrange[0] = nrange[1];
3071 root->cell_f[i] = root->bound_max[i];
3072 nrange[1] = i;
3073 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3074 nrange[0] = i;
3075 nrange[1] = range[1];
3078 if (nrange[1] < range[1]) /* the last violation found was a LimLo */
3080 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3081 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3082 nrange[0] = nrange[1];
3083 nrange[1] = range[1];
3084 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3086 else if (nrange[0] > range[0]) /* found at least one LimHi */
3088 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3095 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3096 int d, int dim, domdec_root_t *root,
3097 const gmx_ddbox_t *ddbox,
3098 gmx_bool bDynamicBox,
3099 gmx_bool bUniform, gmx_int64_t step)
3101 gmx_domdec_comm_t *comm;
3102 int ncd, d1, i, pos;
3103 real *cell_size;
3104 real load_aver, load_i, imbalance, change, change_max, sc;
3105 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3106 real change_limit;
3107 real relax = 0.5;
3108 gmx_bool bPBC;
3109 int range[] = { 0, 0 };
3111 comm = dd->comm;
3113 /* Convert the maximum change from the input percentage to a fraction */
3114 change_limit = comm->dlb_scale_lim*0.01;
3116 ncd = dd->nc[dim];
3118 bPBC = (dim < ddbox->npbcdim);
3120 cell_size = root->buf_ncd;
3122 /* Store the original boundaries */
3123 for (i = 0; i < ncd+1; i++)
3125 root->old_cell_f[i] = root->cell_f[i];
3127 if (bUniform)
3129 for (i = 0; i < ncd; i++)
3131 cell_size[i] = 1.0/ncd;
3134 else if (dd_load_count(comm) > 0)
3136 load_aver = comm->load[d].sum_m/ncd;
3137 change_max = 0;
3138 for (i = 0; i < ncd; i++)
3140 /* Determine the relative imbalance of cell i */
3141 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3142 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3143 /* Determine the change of the cell size using underrelaxation */
3144 change = -relax*imbalance;
3145 change_max = std::max(change_max, std::max(change, -change));
3147 /* Limit the amount of scaling.
3148 * We need to use the same rescaling for all cells in one row,
3149 * otherwise the load balancing might not converge.
3151 sc = relax;
3152 if (change_max > change_limit)
3154 sc *= change_limit/change_max;
3156 for (i = 0; i < ncd; i++)
3158 /* Determine the relative imbalance of cell i */
3159 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3160 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3161 /* Determine the change of the cell size using underrelaxation */
3162 change = -sc*imbalance;
3163 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
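/* Hypothetical numbers for the update above: load_aver = 100 and
 * load_i = 120 give imbalance = 0.2; with sc = relax = 0.5 the change is
 * -0.1, i.e. the overloaded cell shrinks by 10% of its current width.
 * If the largest requested change exceeds comm->dlb_scale_lim percent,
 * sc is reduced (by the same factor for the whole row) so that it does not.
 */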
3167 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3168 cellsize_limit_f *= DD_CELL_MARGIN;
3169 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3170 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3171 if (ddbox->tric_dir[dim])
3173 cellsize_limit_f /= ddbox->skew_fac[dim];
3174 dist_min_f /= ddbox->skew_fac[dim];
3176 if (bDynamicBox && d > 0)
3178 dist_min_f *= DD_PRES_SCALE_MARGIN;
3180 if (d > 0 && !bUniform)
3182 /* Make sure that the grid is not shifted too much */
3183 for (i = 1; i < ncd; i++)
3185 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3187 gmx_incons("Inconsistent DD boundary staggering limits!");
3189 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3190 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3191 if (space > 0)
3193 root->bound_min[i] += 0.5*space;
3195 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3196 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3197 if (space < 0)
3199 root->bound_max[i] += 0.5*space;
3201 if (debug)
3203 fprintf(debug,
3204 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3205 d, i,
3206 root->cell_f_max0[i-1] + dist_min_f,
3207 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3208 root->cell_f_min1[i] - dist_min_f);
3212 range[1] = ncd;
3213 root->cell_f[0] = 0;
3214 root->cell_f[ncd] = 1;
3215 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3218 /* After the checks above, the cells should obey the cut-off
3219 * restrictions, but it does not hurt to check.
3221 for (i = 0; i < ncd; i++)
3223 if (debug)
3225 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3226 dim, i, root->cell_f[i], root->cell_f[i+1]);
3229 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3230 root->cell_f[i+1] - root->cell_f[i] <
3231 cellsize_limit_f/DD_CELL_MARGIN)
3233 char buf[22];
3234 fprintf(stderr,
3235 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3236 gmx_step_str(step, buf), dim2char(dim), i,
3237 (root->cell_f[i+1] - root->cell_f[i])
3238 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3242 pos = ncd + 1;
3243 /* Store the cell boundaries of the lower dimensions at the end */
3244 for (d1 = 0; d1 < d; d1++)
3246 root->cell_f[pos++] = comm->cell_f0[d1];
3247 root->cell_f[pos++] = comm->cell_f1[d1];
3250 if (d < comm->npmedecompdim)
3252 /* The master determines the maximum shift for
3253 * the coordinate communication between separate PME nodes.
3255 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3257 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3258 if (d >= 1)
3260 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3264 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3265 const gmx_ddbox_t *ddbox,
3266 int dimind)
3268 gmx_domdec_comm_t *comm;
3269 int dim;
3271 comm = dd->comm;
3273 /* Set the cell dimensions */
3274 dim = dd->dim[dimind];
3275 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3276 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3277 if (dim >= ddbox->nboundeddim)
3279 comm->cell_x0[dim] += ddbox->box0[dim];
3280 comm->cell_x1[dim] += ddbox->box0[dim];
3284 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3285 int d, int dim, real *cell_f_row,
3286 const gmx_ddbox_t *ddbox)
3288 gmx_domdec_comm_t *comm;
3289 int d1, pos;
3291 comm = dd->comm;
3293 #if GMX_MPI
3294 /* Each node would only need to know two fractions,
3295 * but it is probably cheaper to broadcast the whole array.
3297 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3298 0, comm->mpi_comm_load[d]);
3299 #endif
3300 /* Copy the fractions for this dimension from the buffer */
3301 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3302 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3303 /* The whole array was communicated, so set the buffer position */
3304 pos = dd->nc[dim] + 1;
3305 for (d1 = 0; d1 <= d; d1++)
3307 if (d1 < d)
3309 /* Copy the cell fractions of the lower dimensions */
3310 comm->cell_f0[d1] = cell_f_row[pos++];
3311 comm->cell_f1[d1] = cell_f_row[pos++];
3313 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3315 /* Convert the communicated shift from float to int */
3316 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3317 if (d >= 1)
3319 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3323 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3324 const gmx_ddbox_t *ddbox,
3325 gmx_bool bDynamicBox,
3326 gmx_bool bUniform, gmx_int64_t step)
3328 gmx_domdec_comm_t *comm;
3329 int d, dim, d1;
3330 gmx_bool bRowMember, bRowRoot;
3331 real *cell_f_row;
3333 comm = dd->comm;
3335 for (d = 0; d < dd->ndim; d++)
3337 dim = dd->dim[d];
3338 bRowMember = TRUE;
3339 bRowRoot = TRUE;
3340 for (d1 = d; d1 < dd->ndim; d1++)
3342 if (dd->ci[dd->dim[d1]] > 0)
3344 if (d1 != d)
3346 bRowMember = FALSE;
3348 bRowRoot = FALSE;
3351 if (bRowMember)
3353 if (bRowRoot)
3355 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3356 ddbox, bDynamicBox, bUniform, step);
3357 cell_f_row = comm->root[d]->cell_f;
3359 else
3361 cell_f_row = comm->cell_f_row;
3363 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3368 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3369 const gmx_ddbox_t *ddbox)
3371 int d;
3373 /* This function assumes the box is static and should therefore
3374 * not be called when the box has changed since the last
3375 * call to dd_partition_system.
3377 for (d = 0; d < dd->ndim; d++)
3379 relative_to_absolute_cell_bounds(dd, ddbox, d);
3385 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3386 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3387 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3388 gmx_wallcycle_t wcycle)
3390 gmx_domdec_comm_t *comm;
3391 int dim;
3393 comm = dd->comm;
3395 if (bDoDLB)
3397 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3398 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3399 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3401 else if (bDynamicBox)
3403 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3406 /* Set the dimensions for which no DD is used */
3407 for (dim = 0; dim < DIM; dim++)
3409 if (dd->nc[dim] == 1)
3411 comm->cell_x0[dim] = 0;
3412 comm->cell_x1[dim] = ddbox->box_size[dim];
3413 if (dim >= ddbox->nboundeddim)
3415 comm->cell_x0[dim] += ddbox->box0[dim];
3416 comm->cell_x1[dim] += ddbox->box0[dim];
3422 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3424 int d, np, i;
3425 gmx_domdec_comm_dim_t *cd;
3427 for (d = 0; d < dd->ndim; d++)
3429 cd = &dd->comm->cd[d];
3430 np = npulse[dd->dim[d]];
3431 if (np > cd->np_nalloc)
3433 if (debug)
3435 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3436 dim2char(dd->dim[d]), np);
3438 if (DDMASTER(dd) && cd->np_nalloc > 0)
3440 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3442 srenew(cd->ind, np);
3443 for (i = cd->np_nalloc; i < np; i++)
3445 cd->ind[i].index = nullptr;
3446 cd->ind[i].nalloc = 0;
3448 cd->np_nalloc = np;
3450 cd->np = np;
3455 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3456 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3457 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3458 gmx_wallcycle_t wcycle)
3460 gmx_domdec_comm_t *comm;
3461 int d;
3462 ivec npulse;
3464 comm = dd->comm;
3466 /* Copy the old cell boundaries for the cg displacement check */
3467 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3468 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3470 if (dlbIsOn(comm))
3472 if (DDMASTER(dd))
3474 check_box_size(dd, ddbox);
3476 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3478 else
3480 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3481 realloc_comm_ind(dd, npulse);
3484 if (debug)
3486 for (d = 0; d < DIM; d++)
3488 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3489 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3494 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3495 gmx_ddbox_t *ddbox,
3496 rvec cell_ns_x0, rvec cell_ns_x1,
3497 gmx_int64_t step)
3499 gmx_domdec_comm_t *comm;
3500 int dim_ind, dim;
3502 comm = dd->comm;
3504 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3506 dim = dd->dim[dim_ind];
3508 /* Without PBC we don't have restrictions on the outer cells */
3509 if (!(dim >= ddbox->npbcdim &&
3510 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3511 dlbIsOn(comm) &&
3512 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3513 comm->cellsize_min[dim])
3515 char buf[22];
3516 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3517 gmx_step_str(step, buf), dim2char(dim),
3518 comm->cell_x1[dim] - comm->cell_x0[dim],
3519 ddbox->skew_fac[dim],
3520 dd->comm->cellsize_min[dim],
3521 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3525 if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3527 /* Communicate the boundaries and update cell_ns_x0/1 */
3528 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3529 if (dlbIsOn(dd->comm) && dd->ndim > 1)
3531 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3536 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3538 if (YY < npbcdim)
3540 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3542 else
3544 tcm[YY][XX] = 0;
3546 if (ZZ < npbcdim)
3548 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3549 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3551 else
3553 tcm[ZZ][XX] = 0;
3554 tcm[ZZ][YY] = 0;
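/* A brief sketch of how tcm is used (see distribute_cg() and calc_cg_move()
 * below): the position along decomposition dimension d is evaluated as
 *   pos_d = x[d] + sum_{j>d} x[j]*tcm[j][d]
 * which subtracts the off-diagonal (shear) contributions of the higher
 * triclinic box vectors, so that cells can be assigned along each box
 * vector independently.
 */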
3558 static void check_screw_box(matrix box)
3560 /* Mathematical limitation */
3561 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3563 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3566 /* Limitation due to the asymmetry of the eighth shell method */
3567 if (box[ZZ][YY] != 0)
3569 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3573 static void distribute_cg(FILE *fplog,
3574 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3575 gmx_domdec_t *dd)
3577 gmx_domdec_master_t *ma;
3578 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3579 int i, icg, j, k, k0, k1, d;
3580 matrix tcm;
3581 rvec cg_cm;
3582 ivec ind;
3583 real nrcg, inv_ncg, pos_d;
3584 int *cgindex;
3585 gmx_bool bScrew;
3587 ma = dd->ma;
3589 snew(tmp_nalloc, dd->nnodes);
3590 snew(tmp_ind, dd->nnodes);
3591 for (i = 0; i < dd->nnodes; i++)
3593 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3594 snew(tmp_ind[i], tmp_nalloc[i]);
3597 /* Clear the count */
3598 for (i = 0; i < dd->nnodes; i++)
3600 ma->ncg[i] = 0;
3601 ma->nat[i] = 0;
3604 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3606 cgindex = cgs->index;
3608 /* Compute the center of geometry for all charge groups */
3609 for (icg = 0; icg < cgs->nr; icg++)
3611 k0 = cgindex[icg];
3612 k1 = cgindex[icg+1];
3613 nrcg = k1 - k0;
3614 if (nrcg == 1)
3616 copy_rvec(pos[k0], cg_cm);
3618 else
3620 inv_ncg = 1.0/nrcg;
3622 clear_rvec(cg_cm);
3623 for (k = k0; (k < k1); k++)
3625 rvec_inc(cg_cm, pos[k]);
3627 for (d = 0; (d < DIM); d++)
3629 cg_cm[d] *= inv_ncg;
3632 /* Put the charge group in the box and determine the cell index */
3633 for (d = DIM-1; d >= 0; d--)
3635 pos_d = cg_cm[d];
3636 if (d < dd->npbcdim)
3638 bScrew = (dd->bScrewPBC && d == XX);
3639 if (tric_dir[d] && dd->nc[d] > 1)
3641 /* Use triclinic coordinates for this dimension */
3642 for (j = d+1; j < DIM; j++)
3644 pos_d += cg_cm[j]*tcm[j][d];
3647 while (pos_d >= box[d][d])
3649 pos_d -= box[d][d];
3650 rvec_dec(cg_cm, box[d]);
3651 if (bScrew)
3653 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3654 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3656 for (k = k0; (k < k1); k++)
3658 rvec_dec(pos[k], box[d]);
3659 if (bScrew)
3661 pos[k][YY] = box[YY][YY] - pos[k][YY];
3662 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3666 while (pos_d < 0)
3668 pos_d += box[d][d];
3669 rvec_inc(cg_cm, box[d]);
3670 if (bScrew)
3672 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3673 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3675 for (k = k0; (k < k1); k++)
3677 rvec_inc(pos[k], box[d]);
3678 if (bScrew)
3680 pos[k][YY] = box[YY][YY] - pos[k][YY];
3681 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3686 /* This could be done more efficiently */
3687 ind[d] = 0;
3688 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3690 ind[d]++;
3693 i = dd_index(dd->nc, ind);
3694 if (ma->ncg[i] == tmp_nalloc[i])
3696 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3697 srenew(tmp_ind[i], tmp_nalloc[i]);
3699 tmp_ind[i][ma->ncg[i]] = icg;
3700 ma->ncg[i]++;
3701 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3704 k1 = 0;
3705 for (i = 0; i < dd->nnodes; i++)
3707 ma->index[i] = k1;
3708 for (k = 0; k < ma->ncg[i]; k++)
3710 ma->cg[k1++] = tmp_ind[i][k];
3713 ma->index[dd->nnodes] = k1;
3715 for (i = 0; i < dd->nnodes; i++)
3717 sfree(tmp_ind[i]);
3719 sfree(tmp_ind);
3720 sfree(tmp_nalloc);
3722 if (fplog)
3724 // Use double for the sums to avoid natoms^2 overflowing
3725 // (65537^2 > 2^32)
3726 int nat_sum, nat_min, nat_max;
3727 double nat2_sum;
3729 nat_sum = 0;
3730 nat2_sum = 0;
3731 nat_min = ma->nat[0];
3732 nat_max = ma->nat[0];
3733 for (i = 0; i < dd->nnodes; i++)
3735 nat_sum += ma->nat[i];
3736 // cast to double to avoid integer overflows when squaring
3737 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3738 nat_min = std::min(nat_min, ma->nat[i]);
3739 nat_max = std::max(nat_max, ma->nat[i]);
3741 nat_sum /= dd->nnodes;
3742 nat2_sum /= dd->nnodes;
3744 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3745 dd->nnodes,
3746 nat_sum,
3747 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3748 nat_min, nat_max);
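// The printed stddev follows from var(n) = <n^2> - <n>^2 over the domains;
// e.g. (hypothetical counts) nat = {4, 6} gives <n> = 5, <n^2> = 26 and a
// stddev of 1 atom.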
3752 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3753 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3754 rvec pos[])
3756 gmx_domdec_master_t *ma = nullptr;
3757 ivec npulse;
3758 int i, cg_gl;
3759 int *ibuf, buf2[2] = { 0, 0 };
3760 gmx_bool bMaster = DDMASTER(dd);
3762 if (bMaster)
3764 ma = dd->ma;
3766 if (dd->bScrewPBC)
3768 check_screw_box(box);
3771 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3773 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3774 for (i = 0; i < dd->nnodes; i++)
3776 ma->ibuf[2*i] = ma->ncg[i];
3777 ma->ibuf[2*i+1] = ma->nat[i];
3779 ibuf = ma->ibuf;
3781 else
3783 ibuf = nullptr;
3785 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3787 dd->ncg_home = buf2[0];
3788 dd->nat_home = buf2[1];
3789 dd->ncg_tot = dd->ncg_home;
3790 dd->nat_tot = dd->nat_home;
3791 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3793 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3794 srenew(dd->index_gl, dd->cg_nalloc);
3795 srenew(dd->cgindex, dd->cg_nalloc+1);
3797 if (bMaster)
3799 for (i = 0; i < dd->nnodes; i++)
3801 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3802 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3806 dd_scatterv(dd,
3807 bMaster ? ma->ibuf : nullptr,
3808 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3809 bMaster ? ma->cg : nullptr,
3810 dd->ncg_home*sizeof(int), dd->index_gl);
3812 /* Determine the home charge group sizes */
3813 dd->cgindex[0] = 0;
3814 for (i = 0; i < dd->ncg_home; i++)
3816 cg_gl = dd->index_gl[i];
3817 dd->cgindex[i+1] =
3818 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3821 if (debug)
3823 fprintf(debug, "Home charge groups:\n");
3824 for (i = 0; i < dd->ncg_home; i++)
3826 fprintf(debug, " %d", dd->index_gl[i]);
3827 if (i % 10 == 9)
3829 fprintf(debug, "\n");
3832 fprintf(debug, "\n");
3836 static int compact_and_copy_vec_at(int ncg, int *move,
3837 int *cgindex,
3838 int nvec, int vec,
3839 rvec *src, gmx_domdec_comm_t *comm,
3840 gmx_bool bCompact)
3842 int m, icg, i, i0, i1, nrcg;
3843 int home_pos;
3844 int pos_vec[DIM*2];
3846 home_pos = 0;
3848 for (m = 0; m < DIM*2; m++)
3850 pos_vec[m] = 0;
3853 i0 = 0;
3854 for (icg = 0; icg < ncg; icg++)
3856 i1 = cgindex[icg+1];
3857 m = move[icg];
3858 if (m == -1)
3860 if (bCompact)
3862 /* Compact the home array in place */
3863 for (i = i0; i < i1; i++)
3865 copy_rvec(src[i], src[home_pos++]);
3869 else
3871 /* Copy to the communication buffer */
3872 nrcg = i1 - i0;
3873 pos_vec[m] += 1 + vec*nrcg;
3874 for (i = i0; i < i1; i++)
3876 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3878 pos_vec[m] += (nvec - vec - 1)*nrcg;
3880 if (!bCompact)
3882 home_pos += i1 - i0;
3884 i0 = i1;
3887 return home_pos;
3890 static int compact_and_copy_vec_cg(int ncg, int *move,
3891 int *cgindex,
3892 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3893 gmx_bool bCompact)
3895 int m, icg, i0, i1, nrcg;
3896 int home_pos;
3897 int pos_vec[DIM*2];
3899 home_pos = 0;
3901 for (m = 0; m < DIM*2; m++)
3903 pos_vec[m] = 0;
3906 i0 = 0;
3907 for (icg = 0; icg < ncg; icg++)
3909 i1 = cgindex[icg+1];
3910 m = move[icg];
3911 if (m == -1)
3913 if (bCompact)
3915 /* Compact the home array in place */
3916 copy_rvec(src[icg], src[home_pos++]);
3919 else
3921 nrcg = i1 - i0;
3922 /* Copy to the communication buffer */
3923 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3924 pos_vec[m] += 1 + nrcg*nvec;
3926 i0 = i1;
3928 if (!bCompact)
3930 home_pos = ncg;
3933 return home_pos;
3936 static int compact_ind(int ncg, int *move,
3937 int *index_gl, int *cgindex,
3938 int *gatindex,
3939 gmx_ga2la_t *ga2la, char *bLocalCG,
3940 int *cginfo)
3942 int cg, nat, a0, a1, a, a_gl;
3943 int home_pos;
3945 home_pos = 0;
3946 nat = 0;
3947 for (cg = 0; cg < ncg; cg++)
3949 a0 = cgindex[cg];
3950 a1 = cgindex[cg+1];
3951 if (move[cg] == -1)
3953 /* Compact the home arrays in place.
3954 * Anything that can be done here avoids access to global arrays.
3956 cgindex[home_pos] = nat;
3957 for (a = a0; a < a1; a++)
3959 a_gl = gatindex[a];
3960 gatindex[nat] = a_gl;
3961 /* The cell number stays 0, so we don't need to set it */
3962 ga2la_change_la(ga2la, a_gl, nat);
3963 nat++;
3965 index_gl[home_pos] = index_gl[cg];
3966 cginfo[home_pos] = cginfo[cg];
3967 /* The charge group remains local, so bLocalCG does not change */
3968 home_pos++;
3970 else
3972 /* Clear the global indices */
3973 for (a = a0; a < a1; a++)
3975 ga2la_del(ga2la, gatindex[a]);
3977 if (bLocalCG)
3979 bLocalCG[index_gl[cg]] = FALSE;
3983 cgindex[home_pos] = nat;
3985 return home_pos;
3988 static void clear_and_mark_ind(int ncg, int *move,
3989 int *index_gl, int *cgindex, int *gatindex,
3990 gmx_ga2la_t *ga2la, char *bLocalCG,
3991 int *cell_index)
3993 int cg, a0, a1, a;
3995 for (cg = 0; cg < ncg; cg++)
3997 if (move[cg] >= 0)
3999 a0 = cgindex[cg];
4000 a1 = cgindex[cg+1];
4001 /* Clear the global indices */
4002 for (a = a0; a < a1; a++)
4004 ga2la_del(ga2la, gatindex[a]);
4006 if (bLocalCG)
4008 bLocalCG[index_gl[cg]] = FALSE;
4010 /* Signal that this cg has moved using the ns cell index.
4011 * Here we set it to -1. fill_grid will change it
4012 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4014 cell_index[cg] = -1;
4019 static void print_cg_move(FILE *fplog,
4020 gmx_domdec_t *dd,
4021 gmx_int64_t step, int cg, int dim, int dir,
4022 gmx_bool bHaveCgcmOld, real limitd,
4023 rvec cm_old, rvec cm_new, real pos_d)
4025 gmx_domdec_comm_t *comm;
4026 char buf[22];
4028 comm = dd->comm;
4030 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4031 if (limitd > 0)
4033 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4034 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4035 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4037 else
4039 /* We don't have a limiting distance available: don't print it */
4040 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4041 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4042 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4044 fprintf(fplog, "distance out of cell %f\n",
4045 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4046 if (bHaveCgcmOld)
4048 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4049 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4051 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4052 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4053 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4054 dim2char(dim),
4055 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4056 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4057 dim2char(dim),
4058 comm->cell_x0[dim], comm->cell_x1[dim]);
4061 static void cg_move_error(FILE *fplog,
4062 gmx_domdec_t *dd,
4063 gmx_int64_t step, int cg, int dim, int dir,
4064 gmx_bool bHaveCgcmOld, real limitd,
4065 rvec cm_old, rvec cm_new, real pos_d)
4067 if (fplog)
4069 print_cg_move(fplog, dd, step, cg, dim, dir,
4070 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4072 print_cg_move(stderr, dd, step, cg, dim, dir,
4073 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4074 gmx_fatal(FARGS,
4075 "%s moved too far between two domain decomposition steps\n"
4076 "This usually means that your system is not well equilibrated",
4077 dd->comm->bCGs ? "A charge group" : "An atom");
4080 static void rotate_state_atom(t_state *state, int a)
4082 if (state->flags & (1 << estX))
4084 /* Rotate the complete state; for a rectangular box only */
4085 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4086 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4088 if (state->flags & (1 << estV))
4090 state->v[a][YY] = -state->v[a][YY];
4091 state->v[a][ZZ] = -state->v[a][ZZ];
4093 if (state->flags & (1 << estCGP))
4095 state->cg_p[a][YY] = -state->cg_p[a][YY];
4096 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4100 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4102 if (natoms > comm->moved_nalloc)
4104 /* Contents should be preserved here */
4105 comm->moved_nalloc = over_alloc_dd(natoms);
4106 srenew(comm->moved, comm->moved_nalloc);
4109 return comm->moved;
4112 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4113 gmx_domdec_t *dd,
4114 t_state *state,
4115 ivec tric_dir, matrix tcm,
4116 rvec cell_x0, rvec cell_x1,
4117 rvec limitd, rvec limit0, rvec limit1,
4118 const int *cgindex,
4119 int cg_start, int cg_end,
4120 rvec *cg_cm,
4121 int *move)
4123 int npbcdim;
4124 int cg, k, k0, k1, d, dim, d2;
4125 int mc, nrcg;
4126 int flag;
4127 gmx_bool bScrew;
4128 ivec dev;
4129 real inv_ncg, pos_d;
4130 rvec cm_new;
4132 npbcdim = dd->npbcdim;
4134 for (cg = cg_start; cg < cg_end; cg++)
4136 k0 = cgindex[cg];
4137 k1 = cgindex[cg+1];
4138 nrcg = k1 - k0;
4139 if (nrcg == 1)
4141 copy_rvec(state->x[k0], cm_new);
4143 else
4145 inv_ncg = 1.0/nrcg;
4147 clear_rvec(cm_new);
4148 for (k = k0; (k < k1); k++)
4150 rvec_inc(cm_new, state->x[k]);
4152 for (d = 0; (d < DIM); d++)
4154 cm_new[d] = inv_ncg*cm_new[d];
4158 clear_ivec(dev);
4159 /* Do pbc and check DD cell boundary crossings */
4160 for (d = DIM-1; d >= 0; d--)
4162 if (dd->nc[d] > 1)
4164 bScrew = (dd->bScrewPBC && d == XX);
4165 /* Determine the location of this cg in lattice coordinates */
4166 pos_d = cm_new[d];
4167 if (tric_dir[d])
4169 for (d2 = d+1; d2 < DIM; d2++)
4171 pos_d += cm_new[d2]*tcm[d2][d];
4174 /* Put the charge group in the triclinic unit-cell */
4175 if (pos_d >= cell_x1[d])
4177 if (pos_d >= limit1[d])
4179 cg_move_error(fplog, dd, step, cg, d, 1,
4180 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4181 cg_cm[cg], cm_new, pos_d);
4183 dev[d] = 1;
4184 if (dd->ci[d] == dd->nc[d] - 1)
4186 rvec_dec(cm_new, state->box[d]);
4187 if (bScrew)
4189 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4190 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4192 for (k = k0; (k < k1); k++)
4194 rvec_dec(state->x[k], state->box[d]);
4195 if (bScrew)
4197 rotate_state_atom(state, k);
4202 else if (pos_d < cell_x0[d])
4204 if (pos_d < limit0[d])
4206 cg_move_error(fplog, dd, step, cg, d, -1,
4207 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4208 cg_cm[cg], cm_new, pos_d);
4210 dev[d] = -1;
4211 if (dd->ci[d] == 0)
4213 rvec_inc(cm_new, state->box[d]);
4214 if (bScrew)
4216 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4217 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4219 for (k = k0; (k < k1); k++)
4221 rvec_inc(state->x[k], state->box[d]);
4222 if (bScrew)
4224 rotate_state_atom(state, k);
4230 else if (d < npbcdim)
4232 /* Put the charge group in the rectangular unit-cell */
4233 while (cm_new[d] >= state->box[d][d])
4235 rvec_dec(cm_new, state->box[d]);
4236 for (k = k0; (k < k1); k++)
4238 rvec_dec(state->x[k], state->box[d]);
4241 while (cm_new[d] < 0)
4243 rvec_inc(cm_new, state->box[d]);
4244 for (k = k0; (k < k1); k++)
4246 rvec_inc(state->x[k], state->box[d]);
4252 copy_rvec(cm_new, cg_cm[cg]);
4254 /* Determine where this cg should go */
4255 flag = 0;
4256 mc = -1;
4257 for (d = 0; d < dd->ndim; d++)
4259 dim = dd->dim[d];
4260 if (dev[dim] == 1)
4262 flag |= DD_FLAG_FW(d);
4263 if (mc == -1)
4265 mc = d*2;
4268 else if (dev[dim] == -1)
4270 flag |= DD_FLAG_BW(d);
4271 if (mc == -1)
4273 if (dd->nc[dim] > 2)
4275 mc = d*2 + 1;
4277 else
4279 mc = d*2;
4284 /* Temporarily store the flag in move */
4285 move[cg] = mc + flag;
4289 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4290 gmx_domdec_t *dd, ivec tric_dir,
4291 t_state *state, PaddedRVecVector *f,
4292 t_forcerec *fr,
4293 gmx_bool bCompact,
4294 t_nrnb *nrnb,
4295 int *ncg_stay_home,
4296 int *ncg_moved)
4298 int *move;
4299 int npbcdim;
4300 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4301 int i, cg, k, d, dim, dim2, dir, d2, d3;
4302 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4303 int sbuf[2], rbuf[2];
4304 int home_pos_cg, home_pos_at, buf_pos;
4305 int flag;
4306 real pos_d;
4307 matrix tcm;
4308 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4309 int *cgindex;
4310 cginfo_mb_t *cginfo_mb;
4311 gmx_domdec_comm_t *comm;
4312 int *moved;
4313 int nthread, thread;
4315 if (dd->bScrewPBC)
4317 check_screw_box(state->box);
4320 comm = dd->comm;
4321 if (fr->cutoff_scheme == ecutsGROUP)
4323 cg_cm = fr->cg_cm;
4326 // Positions are always present, so there's nothing to flag
4327 bool bV = state->flags & (1<<estV);
4328 bool bCGP = state->flags & (1<<estCGP);
4330 if (dd->ncg_tot > comm->nalloc_int)
4332 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4333 srenew(comm->buf_int, comm->nalloc_int);
4335 move = comm->buf_int;
4337 npbcdim = dd->npbcdim;
4339 for (d = 0; (d < DIM); d++)
4341 limitd[d] = dd->comm->cellsize_min[d];
4342 if (d >= npbcdim && dd->ci[d] == 0)
4344 cell_x0[d] = -GMX_FLOAT_MAX;
4346 else
4348 cell_x0[d] = comm->cell_x0[d];
4350 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4352 cell_x1[d] = GMX_FLOAT_MAX;
4354 else
4356 cell_x1[d] = comm->cell_x1[d];
4358 if (d < npbcdim)
4360 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4361 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4363 else
4365 /* We check after communication if a charge group moved
4366 * more than one cell. Set the pre-comm check limit to float_max.
4368 limit0[d] = -GMX_FLOAT_MAX;
4369 limit1[d] = GMX_FLOAT_MAX;
4373 make_tric_corr_matrix(npbcdim, state->box, tcm);
4375 cgindex = dd->cgindex;
4377 nthread = gmx_omp_nthreads_get(emntDomdec);
4379 /* Compute the center of geometry for all home charge groups
4380 * and put them in the box and determine where they should go.
4382 #pragma omp parallel for num_threads(nthread) schedule(static)
4383 for (thread = 0; thread < nthread; thread++)
4387 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4388 cell_x0, cell_x1, limitd, limit0, limit1,
4389 cgindex,
4390 ( thread *dd->ncg_home)/nthread,
4391 ((thread+1)*dd->ncg_home)/nthread,
4392 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4393 move);
4395 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4398 for (cg = 0; cg < dd->ncg_home; cg++)
4400 if (move[cg] >= 0)
4402 mc = move[cg];
4403 flag = mc & ~DD_FLAG_NRCG;
4404 mc = mc & DD_FLAG_NRCG;
4405 move[cg] = mc;
4407 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4409 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4410 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4412 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4413 /* We store the cg size in the lower 16 bits
4414 * and the place where the charge group should go
4415 * in the next 6 bits. This saves some communication volume.
4417 nrcg = cgindex[cg+1] - cgindex[cg];
4418 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4419 ncg[mc] += 1;
4420 nat[mc] += nrcg;
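/* Packing sketch (names as used above): with nrcg = 3 atoms and the forward
 * flag for dimension d set, the stored value is 3 | DD_FLAG_FW(d); the
 * receiving rank recovers the size again with
 *   nrcg = flag & DD_FLAG_NRCG;
 * as done when processing the received charge groups below.
 */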
4424 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4425 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4427 *ncg_moved = 0;
4428 for (i = 0; i < dd->ndim*2; i++)
4430 *ncg_moved += ncg[i];
4433 nvec = 1;
4434 if (bV)
4436 nvec++;
4438 if (bCGP)
4440 nvec++;
4443 /* Make sure the communication buffers are large enough */
4444 for (mc = 0; mc < dd->ndim*2; mc++)
4446 nvr = ncg[mc] + nat[mc]*nvec;
4447 if (nvr > comm->cgcm_state_nalloc[mc])
4449 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4450 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4454 switch (fr->cutoff_scheme)
4456 case ecutsGROUP:
4457 /* Recalculating cg_cm might be cheaper than communicating,
4458 * but that could give rise to rounding issues.
4460 home_pos_cg =
4461 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4462 nvec, cg_cm, comm, bCompact);
4463 break;
4464 case ecutsVERLET:
4465 /* Without charge groups we send the moved atom coordinates
4466 * over twice. This is so the code below can be used without
4467 * many conditionals for both with and without charge groups.
4469 home_pos_cg =
4470 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4471 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4472 if (bCompact)
4474 home_pos_cg -= *ncg_moved;
4476 break;
4477 default:
4478 gmx_incons("unimplemented");
4479 home_pos_cg = 0;
4482 vec = 0;
4483 home_pos_at =
4484 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4485 nvec, vec++, as_rvec_array(state->x.data()),
4486 comm, bCompact);
4487 if (bV)
4489 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4490 nvec, vec++, as_rvec_array(state->v.data()),
4491 comm, bCompact);
4493 if (bCGP)
4495 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4496 nvec, vec++, as_rvec_array(state->cg_p.data()),
4497 comm, bCompact);
4500 if (bCompact)
4502 compact_ind(dd->ncg_home, move,
4503 dd->index_gl, dd->cgindex, dd->gatindex,
4504 dd->ga2la, comm->bLocalCG,
4505 fr->cginfo);
4507 else
4509 if (fr->cutoff_scheme == ecutsVERLET)
4511 moved = get_moved(comm, dd->ncg_home);
4513 for (k = 0; k < dd->ncg_home; k++)
4515 moved[k] = 0;
4518 else
4520 moved = fr->ns->grid->cell_index;
4523 clear_and_mark_ind(dd->ncg_home, move,
4524 dd->index_gl, dd->cgindex, dd->gatindex,
4525 dd->ga2la, comm->bLocalCG,
4526 moved);
4529 cginfo_mb = fr->cginfo_mb;
4531 *ncg_stay_home = home_pos_cg;
4532 for (d = 0; d < dd->ndim; d++)
4534 dim = dd->dim[d];
4535 ncg_recv = 0;
4536 nvr = 0;
4537 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4539 cdd = d*2 + dir;
4540 /* Communicate the cg and atom counts */
4541 sbuf[0] = ncg[cdd];
4542 sbuf[1] = nat[cdd];
4543 if (debug)
4545 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4546 d, dir, sbuf[0], sbuf[1]);
4548 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4550 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4552 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4553 srenew(comm->buf_int, comm->nalloc_int);
4556 /* Communicate the charge group indices, sizes and flags */
4557 dd_sendrecv_int(dd, d, dir,
4558 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4559 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4561 nvs = ncg[cdd] + nat[cdd]*nvec;
4562 i = rbuf[0] + rbuf[1] *nvec;
4563 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4565 /* Communicate cgcm and state */
4566 dd_sendrecv_rvec(dd, d, dir,
4567 comm->cgcm_state[cdd], nvs,
4568 comm->vbuf.v+nvr, i);
4569 ncg_recv += rbuf[0];
4570 nvr += i;
4573 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4574 if (fr->cutoff_scheme == ecutsGROUP)
4576 /* Here we resize to more than necessary and shrink later */
4577 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4580 /* Process the received charge groups */
4581 buf_pos = 0;
4582 for (cg = 0; cg < ncg_recv; cg++)
4584 flag = comm->buf_int[cg*DD_CGIBS+1];
4586 if (dim >= npbcdim && dd->nc[dim] > 2)
4588 /* No pbc in this dim and more than one domain boundary.
4589 * We do a separate check that a charge group didn't move too far.
4591 if (((flag & DD_FLAG_FW(d)) &&
4592 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4593 ((flag & DD_FLAG_BW(d)) &&
4594 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4596 cg_move_error(fplog, dd, step, cg, dim,
4597 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4598 fr->cutoff_scheme == ecutsGROUP, 0,
4599 comm->vbuf.v[buf_pos],
4600 comm->vbuf.v[buf_pos],
4601 comm->vbuf.v[buf_pos][dim]);
4605 mc = -1;
4606 if (d < dd->ndim-1)
4608 /* Check which direction this cg should go */
4609 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4611 if (dlbIsOn(dd->comm))
4613 /* The cell boundaries for dimension d2 are not equal
4614 * for each cell row of the lower dimension(s),
4615 * therefore we might need to redetermine where
4616 * this cg should go.
4618 dim2 = dd->dim[d2];
4619 /* If this cg crosses the box boundary in dimension d2
4620 * we can use the communicated flag, so we do not
4621 * have to worry about pbc.
4623 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4624 (flag & DD_FLAG_FW(d2))) ||
4625 (dd->ci[dim2] == 0 &&
4626 (flag & DD_FLAG_BW(d2)))))
4628 /* Clear the two flags for this dimension */
4629 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4630 /* Determine the location of this cg
4631 * in lattice coordinates
4633 pos_d = comm->vbuf.v[buf_pos][dim2];
4634 if (tric_dir[dim2])
4636 for (d3 = dim2+1; d3 < DIM; d3++)
4638 pos_d +=
4639 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4642 /* Check if we are not at the box edge.
4643 * pbc is only handled in the first step above,
4644 * but this check could move over pbc while
4645 * the first step did not due to different rounding.
4647 if (pos_d >= cell_x1[dim2] &&
4648 dd->ci[dim2] != dd->nc[dim2]-1)
4650 flag |= DD_FLAG_FW(d2);
4652 else if (pos_d < cell_x0[dim2] &&
4653 dd->ci[dim2] != 0)
4655 flag |= DD_FLAG_BW(d2);
4657 comm->buf_int[cg*DD_CGIBS+1] = flag;
4660 /* Set to which neighboring cell this cg should go */
4661 if (flag & DD_FLAG_FW(d2))
4663 mc = d2*2;
4665 else if (flag & DD_FLAG_BW(d2))
4667 if (dd->nc[dd->dim[d2]] > 2)
4669 mc = d2*2+1;
4671 else
4673 mc = d2*2;
4679 nrcg = flag & DD_FLAG_NRCG;
4680 if (mc == -1)
4682 if (home_pos_cg+1 > dd->cg_nalloc)
4684 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4685 srenew(dd->index_gl, dd->cg_nalloc);
4686 srenew(dd->cgindex, dd->cg_nalloc+1);
4688 /* Set the global charge group index and size */
4689 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4690 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4691 /* Copy the state from the buffer */
4692 if (fr->cutoff_scheme == ecutsGROUP)
4694 cg_cm = fr->cg_cm;
4695 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4697 buf_pos++;
4699 /* Set the cginfo */
4700 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4701 dd->index_gl[home_pos_cg]);
4702 if (comm->bLocalCG)
4704 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4707 for (i = 0; i < nrcg; i++)
4709 copy_rvec(comm->vbuf.v[buf_pos++],
4710 state->x[home_pos_at+i]);
4712 if (bV)
4714 for (i = 0; i < nrcg; i++)
4716 copy_rvec(comm->vbuf.v[buf_pos++],
4717 state->v[home_pos_at+i]);
4720 if (bCGP)
4722 for (i = 0; i < nrcg; i++)
4724 copy_rvec(comm->vbuf.v[buf_pos++],
4725 state->cg_p[home_pos_at+i]);
4728 home_pos_cg += 1;
4729 home_pos_at += nrcg;
4731 else
4733 /* Reallocate the buffers if necessary */
4734 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4736 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4737 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4739 nvr = ncg[mc] + nat[mc]*nvec;
4740 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4742 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4743 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4745 /* Copy from the receive to the send buffers */
4746 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4747 comm->buf_int + cg*DD_CGIBS,
4748 DD_CGIBS*sizeof(int));
4749 memcpy(comm->cgcm_state[mc][nvr],
4750 comm->vbuf.v[buf_pos],
4751 (1+nrcg*nvec)*sizeof(rvec));
4752 buf_pos += 1 + nrcg*nvec;
4753 ncg[mc] += 1;
4754 nat[mc] += nrcg;
4759 /* With sorting (!bCompact) the indices are now only partially up to date
4760 * and ncg_home and nat_home are not the real count, since there are
4761 * "holes" in the arrays for the charge groups that moved to neighbors.
4763 if (fr->cutoff_scheme == ecutsVERLET)
4765 moved = get_moved(comm, home_pos_cg);
4767 for (i = dd->ncg_home; i < home_pos_cg; i++)
4769 moved[i] = 0;
4772 dd->ncg_home = home_pos_cg;
4773 dd->nat_home = home_pos_at;
4775 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4777 /* We overallocated before, we need to set the right size here */
4778 dd_resize_state(state, f, dd->nat_home);
4781 if (debug)
4783 fprintf(debug,
4784 "Finished repartitioning: cgs moved out %d, new home %d\n",
4785 *ncg_moved, dd->ncg_home-*ncg_moved);
4790 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4792 /* Note that the cycles value can be incorrect, either 0 or some
4793 * extremely large value, when our thread migrated to another core
4794 * with an unsynchronized cycle counter. If this happens less often
4795 * than once per nstlist steps, this will not cause issues, since
4796 * we later subtract the maximum value from the sum over nstlist steps.
4797 * A zero count will slightly lower the total, but that's a small effect.
4798 * Note that the main purpose of the subtraction of the maximum value
4799 * is to avoid throwing off the load balancing when stalls occur due
4800 * to, e.g., system activity or network congestion.
4802 dd->comm->cycl[ddCycl] += cycles;
4803 dd->comm->cycl_n[ddCycl]++;
4804 if (cycles > dd->comm->cycl_max[ddCycl])
4806 dd->comm->cycl_max[ddCycl] = cycles;
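/*! \brief Returns a cost estimate, in weighted flop counts, of the force calculation
 *
 * Non-bonded kernel counts are down-weighted (water kernels more strongly)
 * to bring the flop-based estimate closer to the measured timings; bonded
 * interactions are counted at full cost.
 */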
4810 static double force_flop_count(t_nrnb *nrnb)
4812 int i;
4813 double sum;
4814 const char *name;
4816 sum = 0;
4817 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4819 /* To get closer to the real timings, we halve the count
4820 * for the normal loops and halve it again for the water loops.
4822 name = nrnb_str(i);
4823 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4825 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4827 else
4829 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4832 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4834 name = nrnb_str(i);
4835 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4837 sum += nrnb->n[i]*cost_nrnb(i);
4840 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4842 sum += nrnb->n[i]*cost_nrnb(i);
4845 return sum;
4848 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4850 if (dd->comm->eFlop)
4852 dd->comm->flop -= force_flop_count(nrnb);
4855 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4857 if (dd->comm->eFlop)
4859 dd->comm->flop += force_flop_count(nrnb);
4860 dd->comm->flop_n++;
4864 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4866 int i;
4868 for (i = 0; i < ddCyclNr; i++)
4870 dd->comm->cycl[i] = 0;
4871 dd->comm->cycl_n[i] = 0;
4872 dd->comm->cycl_max[i] = 0;
4874 dd->comm->flop = 0;
4875 dd->comm->flop_n = 0;
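/*! \brief Collects the measured load of all PP ranks
 *
 * Loads are gathered along each DD dimension onto the root rank of that row
 * of cells; results for higher dimensions are folded into lower ones, so the
 * final, global load statistics end up on the DD master rank.
 */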
4878 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4880 gmx_domdec_comm_t *comm;
4881 domdec_load_t *load;
4882 domdec_root_t *root = nullptr;
4883 int d, dim, i, pos;
4884 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4885 gmx_bool bSepPME;
4887 if (debug)
4889 fprintf(debug, "get_load_distribution start\n");
4892 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4894 comm = dd->comm;
4896 bSepPME = (dd->pme_nodeid >= 0);
4898 if (dd->ndim == 0 && bSepPME)
4900 /* Without decomposition, but with PME nodes, we need the load */
4901 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4902 comm->load[0].pme = comm->cycl[ddCyclPME];
4905 for (d = dd->ndim-1; d >= 0; d--)
4907 dim = dd->dim[d];
4908 /* Check if we participate in the communication in this dimension */
4909 if (d == dd->ndim-1 ||
4910 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4912 load = &comm->load[d];
4913 if (dlbIsOn(dd->comm))
4915 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4917 pos = 0;
4918 if (d == dd->ndim-1)
4920 sbuf[pos++] = dd_force_load(comm);
4921 sbuf[pos++] = sbuf[0];
4922 if (dlbIsOn(dd->comm))
4924 sbuf[pos++] = sbuf[0];
4925 sbuf[pos++] = cell_frac;
4926 if (d > 0)
4928 sbuf[pos++] = comm->cell_f_max0[d];
4929 sbuf[pos++] = comm->cell_f_min1[d];
4932 if (bSepPME)
4934 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4935 sbuf[pos++] = comm->cycl[ddCyclPME];
4938 else
4940 sbuf[pos++] = comm->load[d+1].sum;
4941 sbuf[pos++] = comm->load[d+1].max;
4942 if (dlbIsOn(dd->comm))
4944 sbuf[pos++] = comm->load[d+1].sum_m;
4945 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4946 sbuf[pos++] = comm->load[d+1].flags;
4947 if (d > 0)
4949 sbuf[pos++] = comm->cell_f_max0[d];
4950 sbuf[pos++] = comm->cell_f_min1[d];
4953 if (bSepPME)
4955 sbuf[pos++] = comm->load[d+1].mdf;
4956 sbuf[pos++] = comm->load[d+1].pme;
4959 load->nload = pos;
4960 /* Communicate a row in DD direction d.
4961 * The communicators are set up such that the root always has rank 0.
4963 #if GMX_MPI
4964 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
4965 load->load, load->nload*sizeof(float), MPI_BYTE,
4966 0, comm->mpi_comm_load[d]);
4967 #endif
4968 if (dd->ci[dim] == dd->master_ci[dim])
4970 /* We are the root, process this row */
4971 if (dlbIsOn(comm))
4973 root = comm->root[d];
4975 load->sum = 0;
4976 load->max = 0;
4977 load->sum_m = 0;
4978 load->cvol_min = 1;
4979 load->flags = 0;
4980 load->mdf = 0;
4981 load->pme = 0;
4982 pos = 0;
4983 for (i = 0; i < dd->nc[dim]; i++)
4985 load->sum += load->load[pos++];
4986 load->max = std::max(load->max, load->load[pos]);
4987 pos++;
4988 if (dlbIsOn(dd->comm))
4990 if (root->bLimited)
4992 /* This direction could not be load balanced properly,
4993 * therefore we need to use the maximum instead of the average load.
4995 load->sum_m = std::max(load->sum_m, load->load[pos]);
4997 else
4999 load->sum_m += load->load[pos];
5001 pos++;
5002 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5003 pos++;
5004 if (d < dd->ndim-1)
5006 load->flags = (int)(load->load[pos++] + 0.5);
5008 if (d > 0)
5010 root->cell_f_max0[i] = load->load[pos++];
5011 root->cell_f_min1[i] = load->load[pos++];
5014 if (bSepPME)
5016 load->mdf = std::max(load->mdf, load->load[pos]);
5017 pos++;
5018 load->pme = std::max(load->pme, load->load[pos]);
5019 pos++;
5022 if (dlbIsOn(comm) && root->bLimited)
5024 load->sum_m *= dd->nc[dim];
5025 load->flags |= (1<<d);
5031 if (DDMASTER(dd))
5033 comm->nload += dd_load_count(comm);
5034 comm->load_step += comm->cycl[ddCyclStep];
5035 comm->load_sum += comm->load[0].sum;
5036 comm->load_max += comm->load[0].max;
5037 if (dlbIsOn(comm))
5039 for (d = 0; d < dd->ndim; d++)
5041 if (comm->load[0].flags & (1<<d))
5043 comm->load_lim[d]++;
5047 if (bSepPME)
5049 comm->load_mdf += comm->load[0].mdf;
5050 comm->load_pme += comm->load[0].pme;
5054 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5056 if (debug)
5058 fprintf(debug, "get_load_distribution finished\n");
5062 static float dd_force_load_fraction(gmx_domdec_t *dd)
5064 /* Return the fraction of the MD step time used for the measured load
5065 * (the force calculation), i.e. the part that load balancing acts on.
5067 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5069 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
5071 else
5073 return 0;
5077 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5079 /* Return the relative performance loss on the total run time
5080 * due to the force calculation load imbalance.
5082 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5084 return
5085 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5086 (dd->comm->load_step*dd->nnodes);
5088 else
5090 return 0;
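/*! \brief Prints the load-balance statistics gathered over the run to fplog and stderr
 *
 * Reports the average load imbalance, the time lost to it, how often DLB was
 * limited by the cell size limits, and the PP/PME load imbalance; only the
 * master rank prints, and only when loads were actually recorded.
 */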
5094 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5096 gmx_domdec_comm_t *comm = dd->comm;
5098 /* Only the master rank prints loads and only if we measured loads */
5099 if (!DDMASTER(dd) || comm->nload == 0)
5101 return;
5104 char buf[STRLEN];
5105 int numPpRanks = dd->nnodes;
5106 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5107 int numRanks = numPpRanks + numPmeRanks;
5108 float lossFraction = 0;
5110 /* Print the average load imbalance and performance loss */
5111 if (dd->nnodes > 1 && comm->load_sum > 0)
5113 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
5114 lossFraction = dd_force_imb_perf_loss(dd);
5115 fprintf(stderr, "\n");
5116 sprintf(buf,
5117 " Load balancing based on %d %% of the MD step time\n"
5118 " Average load imbalance: %.1f %%\n"
5119 " Part of the total run time spent waiting due to load imbalance: %.1f %%\n",
5120 static_cast<int>(dd_force_load_fraction(dd)*100 + 0.5),
5121 imbalance*100,
5122 lossFraction*100);
5123 fprintf(fplog, "%s", buf);
5124 fprintf(stderr, "%s", buf);
5127 /* Print during what percentage of steps the load balancing was limited */
5128 bool dlbWasLimited = false;
5129 if (dlbIsOn(comm))
5131 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5132 for (int d = 0; d < dd->ndim; d++)
5134 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
5135 sprintf(buf+strlen(buf), " %c %d %%",
5136 dim2char(dd->dim[d]), limitPercentage);
5137 if (limitPercentage >= 50)
5139 dlbWasLimited = true;
5142 sprintf(buf + strlen(buf), "\n");
5143 fprintf(fplog, "%s", buf);
5144 fprintf(stderr, "%s", buf);
5147 /* Print the performance loss due to separate PME - PP rank imbalance */
5148 float lossFractionPme = 0;
5149 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5151 float pmeForceRatio = comm->load_pme/comm->load_mdf;
5152 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
5153 if (lossFractionPme <= 0)
5155 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
5157 else
5159 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
5161 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
5162 fprintf(fplog, "%s", buf);
5163 fprintf(stderr, "%s", buf);
5164 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
5165 fprintf(fplog, "%s", buf);
5166 fprintf(stderr, "%s", buf);
5168 fprintf(fplog, "\n");
5169 fprintf(stderr, "\n");
5171 if (lossFraction >= DD_PERF_LOSS_WARN)
5173 sprintf(buf,
5174 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5175 " in the domain decomposition.\n", lossFraction*100);
5176 if (!dlbIsOn(comm))
5178 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb).\n");
5180 else if (dlbWasLimited)
5182 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5184 fprintf(fplog, "%s\n", buf);
5185 fprintf(stderr, "%s\n", buf);
5187 if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
5189 sprintf(buf,
5190 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5191 " had %s work to do than the PP ranks.\n"
5192 " You might want to %s the number of PME ranks\n"
5193 " or %s the cut-off and the grid spacing.\n",
5194 fabs(lossFractionPme*100),
5195 (lossFractionPme < 0) ? "less" : "more",
5196 (lossFractionPme < 0) ? "decrease" : "increase",
5197 (lossFractionPme < 0) ? "decrease" : "increase");
5198 fprintf(fplog, "%s\n", buf);
5199 fprintf(stderr, "%s\n", buf);
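/*! \brief Returns the ratio of the minimum to the average DD cell volume */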
5203 static float dd_vol_min(gmx_domdec_t *dd)
5205 return dd->comm->load[0].cvol_min*dd->nnodes;
5208 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5210 return dd->comm->load[0].flags;
5213 static float dd_f_imbal(gmx_domdec_t *dd)
5215 if (dd->comm->load[0].sum > 0)
5217 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5219 else
5221 /* Something is wrong in the cycle counting, report no load imbalance */
5222 return 0.0f;
5226 float dd_pme_f_ratio(gmx_domdec_t *dd)
5228 /* Should only be called on the DD master rank */
5229 assert(DDMASTER(dd));
5231 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5233 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5235 else
5237 return -1.0;
5241 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5243 int flags, d;
5244 char buf[22];
5246 flags = dd_load_flags(dd);
5247 if (flags)
5249 fprintf(fplog,
5250 "DD load balancing is limited by minimum cell size in dimension");
5251 for (d = 0; d < dd->ndim; d++)
5253 if (flags & (1<<d))
5255 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5258 fprintf(fplog, "\n");
5260 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5261 if (dlbIsOn(dd->comm))
5263 fprintf(fplog, " vol min/aver %5.3f%c",
5264 dd_vol_min(dd), flags ? '!' : ' ');
5266 if (dd->nnodes > 1)
5268 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5270 if (dd->comm->cycl_n[ddCyclPME])
5272 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5274 fprintf(fplog, "\n\n");
5277 static void dd_print_load_verbose(gmx_domdec_t *dd)
5279 if (dlbIsOn(dd->comm))
5281 fprintf(stderr, "vol %4.2f%c ",
5282 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5284 if (dd->nnodes > 1)
5286 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5288 if (dd->comm->cycl_n[ddCyclPME])
5290 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5294 #if GMX_MPI
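/*! \brief Creates the MPI communicator for one row of DD cells along dimension dim_ind
 *
 * On the root rank of the row this also allocates the arrays that hold the
 * DLB cell boundaries for this dimension (when DLB can be used).
 */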
5295 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5297 MPI_Comm c_row;
5298 int dim, i, rank;
5299 ivec loc_c;
5300 domdec_root_t *root;
5301 gmx_bool bPartOfGroup = FALSE;
5303 dim = dd->dim[dim_ind];
5304 copy_ivec(loc, loc_c);
5305 for (i = 0; i < dd->nc[dim]; i++)
5307 loc_c[dim] = i;
5308 rank = dd_index(dd->nc, loc_c);
5309 if (rank == dd->rank)
5311 /* This process is part of the group */
5312 bPartOfGroup = TRUE;
5315 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5316 &c_row);
5317 if (bPartOfGroup)
5319 dd->comm->mpi_comm_load[dim_ind] = c_row;
5320 if (dd->comm->dlbState != edlbsOffForever)
5322 if (dd->ci[dim] == dd->master_ci[dim])
5324 /* This is the root process of this row */
5325 snew(dd->comm->root[dim_ind], 1);
5326 root = dd->comm->root[dim_ind];
5327 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5328 snew(root->old_cell_f, dd->nc[dim]+1);
5329 snew(root->bCellMin, dd->nc[dim]);
5330 if (dim_ind > 0)
5332 snew(root->cell_f_max0, dd->nc[dim]);
5333 snew(root->cell_f_min1, dd->nc[dim]);
5334 snew(root->bound_min, dd->nc[dim]);
5335 snew(root->bound_max, dd->nc[dim]);
5337 snew(root->buf_ncd, dd->nc[dim]);
5339 else
5341 /* This is not a root process, we only need to receive cell_f */
5342 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5345 if (dd->ci[dim] == dd->master_ci[dim])
5347 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5351 #endif
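/* Sets up comm->mpi_comm_gpu_shared, a communicator over the PP ranks on the
 * same physical node that use the same GPU, so the DLB code can take this
 * resource sharing into account.
 */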
5353 void dd_setup_dlb_resource_sharing(t_commrec gmx_unused *cr,
5354 const gmx_hw_info_t gmx_unused *hwinfo,
5355 const gmx_hw_opt_t gmx_unused *hw_opt)
5357 #if GMX_MPI
5358 int physicalnode_id_hash;
5359 int gpu_id;
5360 gmx_domdec_t *dd;
5361 MPI_Comm mpi_comm_pp_physicalnode;
5363 if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5365 /* Only PP nodes (currently) use GPUs.
5366 * If we don't have GPUs, there are no resources to share.
5368 return;
5371 physicalnode_id_hash = gmx_physicalnode_id_hash();
5373 gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5375 dd = cr->dd;
5377 if (debug)
5379 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5380 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5381 dd->rank, physicalnode_id_hash, gpu_id);
5383 /* Split the PP communicator over the physical nodes */
5384 /* TODO: See if we should store this (before), as it's also used
5385 * for the nodecomm summation.
5387 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5388 &mpi_comm_pp_physicalnode);
5389 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5390 &dd->comm->mpi_comm_gpu_shared);
5391 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5392 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5394 if (debug)
5396 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5399 /* Note that some ranks could share a GPU, while others don't */
5401 if (dd->comm->nrank_gpu_shared == 1)
5403 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5405 #endif
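/*! \brief Creates, for every DD dimension, the row communicators used for measuring and communicating the load */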
5408 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5410 #if GMX_MPI
5411 int dim0, dim1, i, j;
5412 ivec loc;
5414 if (debug)
5416 fprintf(debug, "Making load communicators\n");
5419 snew(dd->comm->load, std::max(dd->ndim, 1));
5420 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5422 if (dd->ndim == 0)
5424 return;
5427 clear_ivec(loc);
5428 make_load_communicator(dd, 0, loc);
5429 if (dd->ndim > 1)
5431 dim0 = dd->dim[0];
5432 for (i = 0; i < dd->nc[dim0]; i++)
5434 loc[dim0] = i;
5435 make_load_communicator(dd, 1, loc);
5438 if (dd->ndim > 2)
5440 dim0 = dd->dim[0];
5441 for (i = 0; i < dd->nc[dim0]; i++)
5443 loc[dim0] = i;
5444 dim1 = dd->dim[1];
5445 for (j = 0; j < dd->nc[dim1]; j++)
5447 loc[dim1] = j;
5448 make_load_communicator(dd, 2, loc);
5453 if (debug)
5455 fprintf(debug, "Finished making load communicators\n");
5457 #endif
5460 /*! \brief Sets up the relation between neighboring domains and zones */
5461 static void setup_neighbor_relations(gmx_domdec_t *dd)
5463 int d, dim, i, j, m;
5464 ivec tmp, s;
5465 gmx_domdec_zones_t *zones;
5466 gmx_domdec_ns_ranges_t *izone;
5468 for (d = 0; d < dd->ndim; d++)
5470 dim = dd->dim[d];
5471 copy_ivec(dd->ci, tmp);
5472 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5473 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5474 copy_ivec(dd->ci, tmp);
5475 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5476 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5477 if (debug)
5479 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5480 dd->rank, dim,
5481 dd->neighbor[d][0],
5482 dd->neighbor[d][1]);
5486 int nzone = (1 << dd->ndim);
5487 int nizone = (1 << std::max(dd->ndim - 1, 0));
5488 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5490 zones = &dd->comm->zones;
5492 for (i = 0; i < nzone; i++)
5494 m = 0;
5495 clear_ivec(zones->shift[i]);
5496 for (d = 0; d < dd->ndim; d++)
5498 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5502 zones->n = nzone;
5503 for (i = 0; i < nzone; i++)
5505 for (d = 0; d < DIM; d++)
5507 s[d] = dd->ci[d] - zones->shift[i][d];
5508 if (s[d] < 0)
5510 s[d] += dd->nc[d];
5512 else if (s[d] >= dd->nc[d])
5514 s[d] -= dd->nc[d];
5518 zones->nizone = nizone;
5519 for (i = 0; i < zones->nizone; i++)
5521 assert(ddNonbondedZonePairRanges[i][0] == i);
5523 izone = &zones->izone[i];
5524 /* ddNonbondedZonePairRanges is defined for the full 3D decomposition;
5525 * for fewer dimensions use only the j-zones up to nzone.
5527 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5528 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5529 for (dim = 0; dim < DIM; dim++)
5531 if (dd->nc[dim] == 1)
5533 /* All shifts should be allowed */
5534 izone->shift0[dim] = -1;
5535 izone->shift1[dim] = 1;
5537 else
5539 /* Determine the min/max j-zone shift wrt the i-zone */
5540 izone->shift0[dim] = 1;
5541 izone->shift1[dim] = -1;
5542 for (j = izone->j0; j < izone->j1; j++)
5544 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5545 if (shift_diff < izone->shift0[dim])
5547 izone->shift0[dim] = shift_diff;
5549 if (shift_diff > izone->shift1[dim])
5551 izone->shift1[dim] = shift_diff;
5558 if (dd->comm->dlbState != edlbsOffForever)
5560 snew(dd->comm->root, dd->ndim);
5563 if (dd->comm->bRecordLoad)
5565 make_load_communicators(dd);
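/*! \brief Creates or adopts the communicator for the particle-particle ranks (Cartesian when requested) and determines this rank's DD coordinates and the DD master rank */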
5569 static void make_pp_communicator(FILE *fplog,
5570 gmx_domdec_t *dd,
5571 t_commrec gmx_unused *cr,
5572 int gmx_unused reorder)
5574 #if GMX_MPI
5575 gmx_domdec_comm_t *comm;
5576 int rank, *buf;
5577 ivec periods;
5578 MPI_Comm comm_cart;
5580 comm = dd->comm;
5582 if (comm->bCartesianPP)
5584 /* Set up cartesian communication for the particle-particle part */
5585 if (fplog)
5587 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5588 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5591 for (int i = 0; i < DIM; i++)
5593 periods[i] = TRUE;
5595 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5596 &comm_cart);
5597 /* We overwrite the old communicator with the new cartesian one */
5598 cr->mpi_comm_mygroup = comm_cart;
5601 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5602 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5604 if (comm->bCartesianPP_PME)
5606 /* Since we want to use the original Cartesian setup for the simulation,
5607 * and not the one after the split, we need to make an index.
5609 snew(comm->ddindex2ddnodeid, dd->nnodes);
5610 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5611 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5612 /* Get the rank of the DD master,
5613 * above we made sure that the master node is a PP node.
5615 if (MASTER(cr))
5617 rank = dd->rank;
5619 else
5621 rank = 0;
5623 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5625 else if (comm->bCartesianPP)
5627 if (cr->npmenodes == 0)
5629 /* The PP communicator is also
5630 * the communicator for this simulation
5632 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5634 cr->nodeid = dd->rank;
5636 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5638 /* We need to make an index to go from the coordinates
5639 * to the nodeid of this simulation.
5641 snew(comm->ddindex2simnodeid, dd->nnodes);
5642 snew(buf, dd->nnodes);
5643 if (cr->duty & DUTY_PP)
5645 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5647 /* Communicate the ddindex to simulation nodeid index */
5648 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5649 cr->mpi_comm_mysim);
5650 sfree(buf);
5652 /* Determine the master coordinates and rank.
5653 * The DD master should be the same node as the master of this sim.
5655 for (int i = 0; i < dd->nnodes; i++)
5657 if (comm->ddindex2simnodeid[i] == 0)
5659 ddindex2xyz(dd->nc, i, dd->master_ci);
5660 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5663 if (debug)
5665 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5668 else
5670 /* No Cartesian communicators */
5671 /* We use the rank in dd->mpi_comm_all as DD index */
5672 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5673 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5674 dd->masterrank = 0;
5675 clear_ivec(dd->master_ci);
5677 #endif
5679 if (fplog)
5681 fprintf(fplog,
5682 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5683 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5685 if (debug)
5687 fprintf(debug,
5688 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5689 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
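/*! \brief Participates, on ranks without PP duty, in building the DD-index to simulation-nodeid table
 *
 * The MPI_Allreduce here matches the one in make_pp_communicator, which is
 * only called on PP ranks.
 */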
5693 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5694 t_commrec *cr)
5696 #if GMX_MPI
5697 gmx_domdec_comm_t *comm = dd->comm;
5699 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5701 int *buf;
5702 snew(comm->ddindex2simnodeid, dd->nnodes);
5703 snew(buf, dd->nnodes);
5704 if (cr->duty & DUTY_PP)
5706 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5708 /* Communicate the ddindex to simulation nodeid index */
5709 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5710 cr->mpi_comm_mysim);
5711 sfree(buf);
5713 #else
5714 GMX_UNUSED_VALUE(dd);
5715 GMX_UNUSED_VALUE(cr);
5716 #endif
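/*! \brief Allocates the data the DD master rank uses to distribute and collect the charge groups and state of the whole system */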
5719 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5720 int ncg, int natoms)
5722 gmx_domdec_master_t *ma;
5723 int i;
5725 snew(ma, 1);
5727 snew(ma->ncg, dd->nnodes);
5728 snew(ma->index, dd->nnodes+1);
5729 snew(ma->cg, ncg);
5730 snew(ma->nat, dd->nnodes);
5731 snew(ma->ibuf, dd->nnodes*2);
5732 snew(ma->cell_x, DIM);
5733 for (i = 0; i < DIM; i++)
5735 snew(ma->cell_x[i], dd->nc[i]+1);
5738 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5740 ma->vbuf = nullptr;
5742 else
5744 snew(ma->vbuf, natoms);
5747 return ma;
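/*! \brief Splits the simulation communicator into PP and PME groups, either through a combined Cartesian communicator or according to dd_rank_order, and sets the duty of this rank */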
5750 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5751 int gmx_unused dd_rank_order,
5752 int gmx_unused reorder)
5754 gmx_domdec_comm_t *comm;
5755 int i;
5756 gmx_bool bDiv[DIM];
5757 #if GMX_MPI
5758 MPI_Comm comm_cart;
5759 #endif
5761 comm = dd->comm;
5763 if (comm->bCartesianPP)
5765 for (i = 1; i < DIM; i++)
5767 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5769 if (bDiv[YY] || bDiv[ZZ])
5771 comm->bCartesianPP_PME = TRUE;
5772 /* If we have 2D PME decomposition, which is always in x+y,
5773 * we stack the PME only nodes in z.
5774 * Otherwise we choose the direction that provides the thinnest slab
5775 * of PME only nodes as this will have the least effect
5776 * on the PP communication.
5777 * But for the PME communication the opposite might be better.
5779 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5780 !bDiv[YY] ||
5781 dd->nc[YY] > dd->nc[ZZ]))
5783 comm->cartpmedim = ZZ;
5785 else
5787 comm->cartpmedim = YY;
5789 comm->ntot[comm->cartpmedim]
5790 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5792 else if (fplog)
5794 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5795 fprintf(fplog,
5796 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5800 #if GMX_MPI
5801 if (comm->bCartesianPP_PME)
5803 int rank;
5804 ivec periods;
5806 if (fplog)
5808 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5811 for (i = 0; i < DIM; i++)
5813 periods[i] = TRUE;
5815 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5816 &comm_cart);
5817 MPI_Comm_rank(comm_cart, &rank);
5818 if (MASTER(cr) && rank != 0)
5820 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5823 /* With this assignment we lose the link to the original communicator,
5824 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5826 cr->mpi_comm_mysim = comm_cart;
5827 cr->sim_nodeid = rank;
5829 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5831 if (fplog)
5833 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5834 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5837 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5839 cr->duty = DUTY_PP;
5841 if (cr->npmenodes == 0 ||
5842 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5844 cr->duty = DUTY_PME;
5847 /* Split the sim communicator into PP and PME only nodes */
5848 MPI_Comm_split(cr->mpi_comm_mysim,
5849 cr->duty,
5850 dd_index(comm->ntot, dd->ci),
5851 &cr->mpi_comm_mygroup);
5853 else
5855 switch (dd_rank_order)
5857 case ddrankorderPP_PME:
5858 if (fplog)
5860 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5862 break;
5863 case ddrankorderINTERLEAVE:
5864 /* Interleave the PP-only and PME-only ranks */
5865 if (fplog)
5867 fprintf(fplog, "Interleaving PP and PME ranks\n");
5869 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5870 break;
5871 case ddrankorderCARTESIAN:
5872 break;
5873 default:
5874 gmx_fatal(FARGS, "Unknown dd_rank_order=%d", dd_rank_order);
5877 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5879 cr->duty = DUTY_PME;
5881 else
5883 cr->duty = DUTY_PP;
5886 /* Split the sim communicator into PP and PME only nodes */
5887 MPI_Comm_split(cr->mpi_comm_mysim,
5888 cr->duty,
5889 cr->nodeid,
5890 &cr->mpi_comm_mygroup);
5891 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5893 #endif
5895 if (fplog)
5897 fprintf(fplog, "This rank does only %s work.\n\n",
5898 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5902 /*! \brief Generates the MPI communicators for domain decomposition */
5903 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5904 gmx_domdec_t *dd, int dd_rank_order)
5906 gmx_domdec_comm_t *comm;
5907 int CartReorder;
5909 comm = dd->comm;
5911 copy_ivec(dd->nc, comm->ntot);
5913 comm->bCartesianPP = (dd_rank_order == ddrankorderCARTESIAN);
5914 comm->bCartesianPP_PME = FALSE;
5916 /* Reorder the nodes by default. This might change the MPI ranks.
5917 * Real reordering is only supported on very few architectures;
5918 * Blue Gene is one of them.
5920 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
5922 if (cr->npmenodes > 0)
5924 /* Split the communicator into a PP and PME part */
5925 split_communicator(fplog, cr, dd, dd_rank_order, CartReorder);
5926 if (comm->bCartesianPP_PME)
5928 /* We (possibly) reordered the nodes in split_communicator,
5929 * so it is no longer required in make_pp_communicator.
5931 CartReorder = FALSE;
5934 else
5936 /* All nodes do PP and PME */
5937 #if GMX_MPI
5938 /* We do not require separate communicators */
5939 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5940 #endif
5943 if (cr->duty & DUTY_PP)
5945 /* Copy or make a new PP communicator */
5946 make_pp_communicator(fplog, dd, cr, CartReorder);
5948 else
5950 receive_ddindex2simnodeid(dd, cr);
5953 if (!(cr->duty & DUTY_PME))
5955 /* Set up the communication to our PME node */
5956 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
5957 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
5958 if (debug)
5960 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
5961 dd->pme_nodeid, dd->pme_receive_vir_ener);
5964 else
5966 dd->pme_nodeid = -1;
5969 if (DDMASTER(dd))
5971 dd->ma = init_gmx_domdec_master_t(dd,
5972 comm->cgs_gl.nr,
5973 comm->cgs_gl.index[comm->cgs_gl.nr]);
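/*! \brief Parses the relative cell sizes for static load balancing in one dimension
 *
 * Returns the normalized cell size fractions, or nullptr when no size string
 * is given or the dimension is not decomposed.
 */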
5977 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
5979 real *slb_frac, tot;
5980 int i, n;
5981 double dbl;
5983 slb_frac = nullptr;
5984 if (nc > 1 && size_string != nullptr)
5986 if (fplog)
5988 fprintf(fplog, "Using static load balancing for the %s direction\n",
5989 dir);
5991 snew(slb_frac, nc);
5992 tot = 0;
5993 for (i = 0; i < nc; i++)
5995 dbl = 0;
5996 sscanf(size_string, "%20lf%n", &dbl, &n);
5997 if (dbl == 0)
5999 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6001 slb_frac[i] = dbl;
6002 size_string += n;
6003 tot += slb_frac[i];
6005 /* Normalize */
6006 if (fplog)
6008 fprintf(fplog, "Relative cell sizes:");
6010 for (i = 0; i < nc; i++)
6012 slb_frac[i] /= tot;
6013 if (fplog)
6015 fprintf(fplog, " %5.3f", slb_frac[i]);
6018 if (fplog)
6020 fprintf(fplog, "\n");
6024 return slb_frac;
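/*! \brief Returns the number of bonded interactions in the topology that involve more than two atoms */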
6027 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6029 int n, nmol, ftype;
6030 gmx_mtop_ilistloop_t iloop;
6031 t_ilist *il;
6033 n = 0;
6034 iloop = gmx_mtop_ilistloop_init(mtop);
6035 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6037 for (ftype = 0; ftype < F_NRE; ftype++)
6039 if ((interaction_function[ftype].flags & IF_BOND) &&
6040 NRAL(ftype) > 2)
6042 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6047 return n;
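/*! \brief Returns the integer value of environment variable env_var, or def when it is not set */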
6050 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6052 char *val;
6053 int nst;
6055 nst = def;
6056 val = getenv(env_var);
6057 if (val)
6059 if (sscanf(val, "%20d", &nst) <= 0)
6061 nst = 1;
6063 if (fplog)
6065 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6066 env_var, val, nst);
6070 return nst;
6073 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6075 if (MASTER(cr))
6077 fprintf(stderr, "\n%s\n", warn_string);
6079 if (fplog)
6081 fprintf(fplog, "\n%s\n", warn_string);
6085 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6086 const t_inputrec *ir, FILE *fplog)
6088 if (ir->ePBC == epbcSCREW &&
6089 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6091 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6094 if (ir->ns_type == ensSIMPLE)
6096 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6099 if (ir->nstlist == 0)
6101 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6104 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6106 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6110 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6112 int di, d;
6113 real r;
6115 r = ddbox->box_size[XX];
6116 for (di = 0; di < dd->ndim; di++)
6118 d = dd->dim[di];
6119 /* Check using the initial average cell size */
6120 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6123 return r;
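/*! \brief Determines the initial DLB state from the -dlb option and the run settings
 *
 * Reruns, non-dynamical integrators and missing cycle counters force DLB off;
 * with reproducibility requested, automatic DLB is also disabled.
 */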
6126 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6127 const char *dlb_opt, gmx_bool bRecordLoad,
6128 unsigned long Flags, const t_inputrec *ir)
6130 int dlbState = -1;
6131 char buf[STRLEN];
6133 switch (dlb_opt[0])
6135 case 'a': dlbState = edlbsOffCanTurnOn; break;
6136 case 'n': dlbState = edlbsOffForever; break;
6137 case 'y': dlbState = edlbsOnForever; break;
6138 default: gmx_incons("Unknown dlb_opt");
6141 if (Flags & MD_RERUN)
6143 return edlbsOffForever;
6146 if (!EI_DYNAMICS(ir->eI))
6148 if (dlbState == edlbsOnForever)
6150 sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6151 dd_warning(cr, fplog, buf);
6154 return edlbsOffForever;
6157 if (!bRecordLoad)
6159 dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6160 return edlbsOffForever;
6163 if (Flags & MD_REPRODUCIBLE)
6165 switch (dlbState)
6167 case edlbsOffForever:
6168 break;
6169 case edlbsOffCanTurnOn:
6170 case edlbsOnCanTurnOff:
6171 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6172 dlbState = edlbsOffForever;
6173 break;
6174 case edlbsOnForever:
6175 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6176 break;
6177 default:
6178 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6179 break;
6183 return dlbState;
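/*! \brief Fills dd->dim with the decomposed dimensions, in x,y,z order by default or z,y,x when GMX_DD_ORDER_ZYX is set */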
6186 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6188 int dim;
6190 dd->ndim = 0;
6191 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6193 /* Decomposition order z,y,x */
6194 if (fplog)
6196 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6198 for (dim = DIM-1; dim >= 0; dim--)
6200 if (dd->nc[dim] > 1)
6202 dd->dim[dd->ndim++] = dim;
6206 else
6208 /* Decomposition order x,y,z */
6209 for (dim = 0; dim < DIM; dim++)
6211 if (dd->nc[dim] > 1)
6213 dd->dim[dd->ndim++] = dim;
6219 static gmx_domdec_comm_t *init_dd_comm()
6221 gmx_domdec_comm_t *comm;
6222 int i;
6224 snew(comm, 1);
6225 snew(comm->cggl_flag, DIM*2);
6226 snew(comm->cgcm_state, DIM*2);
6227 for (i = 0; i < DIM*2; i++)
6229 comm->cggl_flag_nalloc[i] = 0;
6230 comm->cgcm_state_nalloc[i] = 0;
6233 comm->nalloc_int = 0;
6234 comm->buf_int = nullptr;
6236 vec_rvec_init(&comm->vbuf);
6238 comm->n_load_have = 0;
6239 comm->n_load_collect = 0;
6241 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6243 comm->sum_nat[i] = 0;
6245 comm->ndecomp = 0;
6246 comm->nload = 0;
6247 comm->load_step = 0;
6248 comm->load_sum = 0;
6249 comm->load_max = 0;
6250 clear_ivec(comm->load_lim);
6251 comm->load_mdf = 0;
6252 comm->load_pme = 0;
6254 return comm;
6257 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6258 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6259 unsigned long Flags,
6260 ivec nc, int nPmeRanks,
6261 real comm_distance_min, real rconstr,
6262 const char *dlb_opt, real dlb_scale,
6263 const char *sizex, const char *sizey, const char *sizez,
6264 const gmx_mtop_t *mtop,
6265 const t_inputrec *ir,
6266 matrix box, const rvec *x,
6267 gmx_ddbox_t *ddbox,
6268 int *npme_x, int *npme_y)
6270 real r_bonded = -1;
6271 real r_bonded_limit = -1;
6272 const real tenPercentMargin = 1.1;
6273 gmx_domdec_comm_t *comm = dd->comm;
6275 snew(comm->cggl_flag, DIM*2);
6276 snew(comm->cgcm_state, DIM*2);
6278 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6279 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6281 dd->pme_recv_f_alloc = 0;
6282 dd->pme_recv_f_buf = nullptr;
6284 /* Initialize the GPU share count to 0; it might change later */
6285 comm->nrank_gpu_shared = 0;
6287 comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6288 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6289 /* To consider turning DLB on after 2*nstlist steps we need to check
6290 * at partitioning count 3. Thus we need to increase the first count by 2.
6292 comm->ddPartioningCountFirstDlbOff += 2;
6294 if (fplog)
6296 fprintf(fplog, "Dynamic load balancing: %s\n",
6297 edlbs_names[comm->dlbState]);
6299 comm->bPMELoadBalDLBLimits = FALSE;
6301 /* Allocate the charge group/atom sorting struct */
6302 snew(comm->sort, 1);
6304 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6306 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6307 mtop->bIntermolecularInteractions);
6308 if (comm->bInterCGBondeds)
6310 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6312 else
6314 comm->bInterCGMultiBody = FALSE;
6317 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6318 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6320 if (ir->rlist == 0)
6322 /* Set the cut-off to some very large value,
6323 * so we don't need if statements everywhere in the code.
6324 * We use sqrt, since the cut-off is squared in some places.
6326 comm->cutoff = GMX_CUTOFF_INF;
6328 else
6330 comm->cutoff = ir->rlist;
6332 comm->cutoff_mbody = 0;
6334 comm->cellsize_limit = 0;
6335 comm->bBondComm = FALSE;
6337 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6338 * within nstlist steps. Since boundaries are allowed to displace by half
6339 * a cell size, DD cells should be at least the size of the list buffer.
6341 comm->cellsize_limit = std::max(comm->cellsize_limit,
6342 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6344 if (comm->bInterCGBondeds)
6346 if (comm_distance_min > 0)
6348 comm->cutoff_mbody = comm_distance_min;
6349 if (Flags & MD_DDBONDCOMM)
6351 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6353 else
6355 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6357 r_bonded_limit = comm->cutoff_mbody;
6359 else if (ir->bPeriodicMols)
6361 /* Can not easily determine the required cut-off */
6362 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6363 comm->cutoff_mbody = comm->cutoff/2;
6364 r_bonded_limit = comm->cutoff_mbody;
6366 else
6368 real r_2b, r_mb;
6370 if (MASTER(cr))
6372 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6373 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6375 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6376 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6378 /* We use an initial margin of 10% for the minimum cell size,
6379 * except when we are just below the non-bonded cut-off.
6381 if (Flags & MD_DDBONDCOMM)
6383 if (std::max(r_2b, r_mb) > comm->cutoff)
6385 r_bonded = std::max(r_2b, r_mb);
6386 r_bonded_limit = tenPercentMargin*r_bonded;
6387 comm->bBondComm = TRUE;
6389 else
6391 r_bonded = r_mb;
6392 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6394 /* We determine cutoff_mbody later */
6396 else
6398 /* No special bonded communication,
6399 * simply increase the DD cut-off.
6401 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6402 comm->cutoff_mbody = r_bonded_limit;
6403 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6406 if (fplog)
6408 fprintf(fplog,
6409 "Minimum cell size due to bonded interactions: %.3f nm\n",
6410 r_bonded_limit);
6412 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6415 if (dd->bInterCGcons && rconstr <= 0)
6417 /* There is a cell size limit due to the constraints (P-LINCS) */
6418 rconstr = constr_r_max(fplog, mtop, ir);
6419 if (fplog)
6421 fprintf(fplog,
6422 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6423 rconstr);
6424 if (rconstr > comm->cellsize_limit)
6426 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6430 else if (rconstr > 0 && fplog)
6432 /* Here we do not check for dd->bInterCGcons,
6433 * because one can also set a cell size limit for virtual sites only
6434 * and at this point we don't know yet if there are intercg v-sites.
6436 fprintf(fplog,
6437 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6438 rconstr);
6440 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6442 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6444 if (nc[XX] > 0)
6446 copy_ivec(nc, dd->nc);
6447 set_dd_dim(fplog, dd);
6448 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6450 if (nPmeRanks >= 0)
6452 cr->npmenodes = nPmeRanks;
6454 else
6456 /* When the DD grid is set explicitly and -npme is set to auto,
6457 * don't use PME ranks. We check later if the DD grid is
6458 * compatible with the total number of ranks.
6460 cr->npmenodes = 0;
6463 real acs = average_cellsize_min(dd, ddbox);
6464 if (acs < comm->cellsize_limit)
6466 if (fplog)
6468 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6470 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6471 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6472 acs, comm->cellsize_limit);
6475 else
6477 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, x, ddbox);
6479 /* We need to choose the optimal DD grid and possibly PME nodes */
6480 real limit =
6481 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6482 nPmeRanks,
6483 comm->dlbState != edlbsOffForever, dlb_scale,
6484 comm->cellsize_limit, comm->cutoff,
6485 comm->bInterCGBondeds);
6487 if (dd->nc[XX] == 0)
6489 char buf[STRLEN];
6490 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6491 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6492 !bC ? "-rdd" : "-rcon",
6493 comm->dlbState != edlbsOffForever ? " or -dds" : "",
6494 bC ? " or your LINCS settings" : "");
6496 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6497 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6498 "%s\n"
6499 "Look in the log file for details on the domain decomposition",
6500 cr->nnodes-cr->npmenodes, limit, buf);
6502 set_dd_dim(fplog, dd);
6505 if (fplog)
6507 fprintf(fplog,
6508 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6509 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6512 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6513 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6515 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6516 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6517 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6519 if (cr->npmenodes > dd->nnodes)
6521 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6522 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6524 if (cr->npmenodes > 0)
6526 comm->npmenodes = cr->npmenodes;
6528 else
6530 comm->npmenodes = dd->nnodes;
6533 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6535 /* The following choices should match those
6536 * in comm_cost_est in domdec_setup.c.
6537 * Note that here the checks have to take into account
6538 * that the decomposition might occur in a different order than xyz
6539 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6540 * in which case they will not match those in comm_cost_est,
6541 * but since that is mainly for testing purposes that's fine.
6543 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6544 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6545 getenv("GMX_PMEONEDD") == nullptr)
6547 comm->npmedecompdim = 2;
6548 comm->npmenodes_x = dd->nc[XX];
6549 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6551 else
6553 /* In case nc is 1 in both x and y we could still choose to
6554 * decompose pme in y instead of x, but we use x for simplicity.
6556 comm->npmedecompdim = 1;
6557 if (dd->dim[0] == YY)
6559 comm->npmenodes_x = 1;
6560 comm->npmenodes_y = comm->npmenodes;
6562 else
6564 comm->npmenodes_x = comm->npmenodes;
6565 comm->npmenodes_y = 1;
6568 if (fplog)
6570 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6571 comm->npmenodes_x, comm->npmenodes_y, 1);
6574 else
6576 comm->npmedecompdim = 0;
6577 comm->npmenodes_x = 0;
6578 comm->npmenodes_y = 0;
6581 /* Technically we don't need both of these,
6582 * but it simplifies the code not to have to recalculate them.
6584 *npme_x = comm->npmenodes_x;
6585 *npme_y = comm->npmenodes_y;
6587 snew(comm->slb_frac, DIM);
6588 if (comm->dlbState == edlbsOffForever)
6590 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6591 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6592 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6595 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6597 if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6599 /* Set the bonded communication distance to halfway between
6600 * the minimum and the maximum,
6601 * since the extra communication cost is nearly zero.
6603 real acs = average_cellsize_min(dd, ddbox);
6604 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6605 if (comm->dlbState != edlbsOffForever)
6607 /* Check if this does not limit the scaling */
6608 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6610 if (!comm->bBondComm)
6612 /* Without bBondComm do not go beyond the n.b. cut-off */
6613 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6614 if (comm->cellsize_limit >= comm->cutoff)
6616 /* We don't lose a lot of efficiency
6617 * when increasing it to the n.b. cut-off.
6618 * It can even be slightly faster, because we need
6619 * fewer checks for the communication setup.
6621 comm->cutoff_mbody = comm->cutoff;
6624 /* Check if we did not end up below our original limit */
6625 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6627 if (comm->cutoff_mbody > comm->cellsize_limit)
6629 comm->cellsize_limit = comm->cutoff_mbody;
6632 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6635 if (debug)
6637 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6638 "cellsize limit %f\n",
6639 comm->bBondComm, comm->cellsize_limit);
6642 if (MASTER(cr))
6644 check_dd_restrictions(cr, dd, ir, fplog);
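/*! \brief Makes the DLB pulse counts and minimum cell sizes, determined in set_cell_limits_dlb, the active limits */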
6648 static void set_dlb_limits(gmx_domdec_t *dd)
6651 int d;
6653 for (d = 0; d < dd->ndim; d++)
6655 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6656 dd->comm->cellsize_min[dd->dim[d]] =
6657 dd->comm->cellsize_min_dlb[dd->dim[d]];
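/*! \brief Turns on dynamic load balancing during the run
 *
 * When the minimum cell size is already within 5% of the cell size limit,
 * DLB is instead disabled for the rest of the run.
 */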
6662 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6664 gmx_domdec_t *dd;
6665 gmx_domdec_comm_t *comm;
6666 real cellsize_min;
6667 int d, nc, i;
6669 dd = cr->dd;
6670 comm = dd->comm;
6672 cellsize_min = comm->cellsize_min[dd->dim[0]];
6673 for (d = 1; d < dd->ndim; d++)
6675 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6678 if (cellsize_min < comm->cellsize_limit*1.05)
6680 char buf[STRLEN];
6681 sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6683 /* Change DLB from "auto" to "no". */
6684 comm->dlbState = edlbsOffForever;
6686 return;
6689 char buf[STRLEN];
6690 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6691 dd_warning(cr, fplog, buf);
6692 comm->dlbState = edlbsOnCanTurnOff;
6694 /* Store the non-DLB performance, so we can check if DLB actually
6695 * improves performance.
6697 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6698 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6700 set_dlb_limits(dd);
6702 /* We can set the required cell size info here,
6703 * so we do not need to communicate this.
6704 * The grid is completely uniform.
6706 for (d = 0; d < dd->ndim; d++)
6708 if (comm->root[d])
6710 comm->load[d].sum_m = comm->load[d].sum;
6712 nc = dd->nc[dd->dim[d]];
6713 for (i = 0; i < nc; i++)
6715 comm->root[d]->cell_f[i] = i/(real)nc;
6716 if (d > 0)
6718 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6719 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6722 comm->root[d]->cell_f[nc] = 1.0;
6727 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6729 gmx_domdec_t *dd = cr->dd;
6731 char buf[STRLEN];
6732 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6733 dd_warning(cr, fplog, buf);
6734 dd->comm->dlbState = edlbsOffCanTurnOn;
6735 dd->comm->haveTurnedOffDlb = true;
6736 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6739 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6741 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6742 char buf[STRLEN];
6743 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6744 dd_warning(cr, fplog, buf);
6745 cr->dd->comm->dlbState = edlbsOffForever;
6748 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6750 int ncg, cg;
6751 char *bLocalCG;
6753 ncg = ncg_mtop(mtop);
6754 snew(bLocalCG, ncg);
6755 for (cg = 0; cg < ncg; cg++)
6757 bLocalCG[cg] = FALSE;
6760 return bLocalCG;
6763 void dd_init_bondeds(FILE *fplog,
6764 gmx_domdec_t *dd,
6765 const gmx_mtop_t *mtop,
6766 const gmx_vsite_t *vsite,
6767 const t_inputrec *ir,
6768 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6770 gmx_domdec_comm_t *comm;
6772 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6774 comm = dd->comm;
6776 if (comm->bBondComm)
6778 /* Communicate atoms beyond the cut-off for bonded interactions */
6779 comm = dd->comm;
6781 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6783 comm->bLocalCG = init_bLocalCG(mtop);
6785 else
6787 /* Only communicate atoms based on cut-off */
6788 comm->cglink = nullptr;
6789 comm->bLocalCG = nullptr;
6793 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6794 const gmx_mtop_t *mtop, const t_inputrec *ir,
6795 gmx_bool bDynLoadBal, real dlb_scale,
6796 const gmx_ddbox_t *ddbox)
6798 gmx_domdec_comm_t *comm;
6799 int d;
6800 ivec np;
6801 real limit, shrink;
6802 char buf[64];
6804 if (fplog == nullptr)
6806 return;
6809 comm = dd->comm;
6811 if (bDynLoadBal)
6813 fprintf(fplog, "The maximum number of communication pulses is:");
6814 for (d = 0; d < dd->ndim; d++)
6816 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6818 fprintf(fplog, "\n");
6819 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6820 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6821 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6822 for (d = 0; d < DIM; d++)
6824 if (dd->nc[d] > 1)
6826 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6828 shrink = 0;
6830 else
6832 shrink =
6833 comm->cellsize_min_dlb[d]/
6834 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6836 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6839 fprintf(fplog, "\n");
6841 else
6843 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6844 fprintf(fplog, "The initial number of communication pulses is:");
6845 for (d = 0; d < dd->ndim; d++)
6847 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6849 fprintf(fplog, "\n");
6850 fprintf(fplog, "The initial domain decomposition cell size is:");
6851 for (d = 0; d < DIM; d++)
6853 if (dd->nc[d] > 1)
6855 fprintf(fplog, " %c %.2f nm",
6856 dim2char(d), dd->comm->cellsize_min[d]);
6859 fprintf(fplog, "\n\n");
6862 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6864 if (comm->bInterCGBondeds ||
6865 bInterCGVsites ||
6866 dd->bInterCGcons || dd->bInterCGsettles)
6868 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6869 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6870 "non-bonded interactions", "", comm->cutoff);
6872 if (bDynLoadBal)
6874 limit = dd->comm->cellsize_limit;
6876 else
6878 if (dynamic_dd_box(ddbox, ir))
6880 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6882 limit = dd->comm->cellsize_min[XX];
6883 for (d = 1; d < DIM; d++)
6885 limit = std::min(limit, dd->comm->cellsize_min[d]);
6889 if (comm->bInterCGBondeds)
6891 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6892 "two-body bonded interactions", "(-rdd)",
6893 std::max(comm->cutoff, comm->cutoff_mbody));
6894 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6895 "multi-body bonded interactions", "(-rdd)",
6896 (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6898 if (bInterCGVsites)
6900 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6901 "virtual site constructions", "(-rcon)", limit);
6903 if (dd->bInterCGcons || dd->bInterCGsettles)
6905 sprintf(buf, "atoms separated by up to %d constraints",
6906 1+ir->nProjOrder);
6907 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6908 buf, "(-rcon)", limit);
6910 fprintf(fplog, "\n");
6913 fflush(fplog);
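/*! \brief Sets the maximum number of DLB communication pulses allowed per DD dimension and the corresponding minimum DLB cell sizes */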
6916 static void set_cell_limits_dlb(gmx_domdec_t *dd,
6917 real dlb_scale,
6918 const t_inputrec *ir,
6919 const gmx_ddbox_t *ddbox)
6921 gmx_domdec_comm_t *comm;
6922 int d, dim, npulse, npulse_d_max, npulse_d;
6923 gmx_bool bNoCutOff;
6925 comm = dd->comm;
6927 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6929 /* Determine the maximum number of comm. pulses in one dimension */
6931 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
6933 /* Determine the maximum required number of grid pulses */
6934 if (comm->cellsize_limit >= comm->cutoff)
6936 /* Only a single pulse is required */
6937 npulse = 1;
6939 else if (!bNoCutOff && comm->cellsize_limit > 0)
6941 /* We round down slightly here to avoid overhead due to the latency
6942 * of extra communication calls when the cut-off
6943 * would be only slightly longer than the cell size.
6944 * Later cellsize_limit is redetermined,
6945 * so we can not miss interactions due to this rounding.
6947 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6949 else
6951 /* There is no cell size limit */
6952 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
6955 if (!bNoCutOff && npulse > 1)
6957 /* See if we can do with fewer pulses, based on dlb_scale */
6958 npulse_d_max = 0;
6959 for (d = 0; d < dd->ndim; d++)
6961 dim = dd->dim[d];
6962 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6963 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6964 npulse_d_max = std::max(npulse_d_max, npulse_d);
6966 npulse = std::min(npulse, npulse_d_max);
6969 /* This env var can override npulse */
6970 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
6971 if (d > 0)
6973 npulse = d;
6976 comm->maxpulse = 1;
6977 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6978 for (d = 0; d < dd->ndim; d++)
6980 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
6981 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6982 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
6983 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
6984 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6986 comm->bVacDLBNoLimit = FALSE;
6990 /* cellsize_limit is set for LINCS in init_domain_decomposition */
6991 if (!comm->bVacDLBNoLimit)
6993 comm->cellsize_limit = std::max(comm->cellsize_limit,
6994 comm->cutoff/comm->maxpulse);
6996 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
6997 /* Set the minimum cell size for each DD dimension */
6998 for (d = 0; d < dd->ndim; d++)
7000 if (comm->bVacDLBNoLimit ||
7001 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7003 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7005 else
7007 comm->cellsize_min_dlb[dd->dim[d]] =
7008 comm->cutoff/comm->cd[d].np_dlb;
7011 if (comm->cutoff_mbody <= 0)
7013 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7015 if (dlbIsOn(comm))
7017 set_dlb_limits(dd);
7021 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7023 /* If each molecule is a single charge group
7024 * or we use domain decomposition for each periodic dimension,
7025 * we do not need to take pbc into account for the bonded interactions.
7027 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7028 !(dd->nc[XX] > 1 &&
7029 dd->nc[YY] > 1 &&
7030 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7033 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7034 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7035 const gmx_mtop_t *mtop, const t_inputrec *ir,
7036 const gmx_ddbox_t *ddbox)
7038 gmx_domdec_comm_t *comm;
7039 int natoms_tot;
7040 real vol_frac;
7042 comm = dd->comm;
7044 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7046 init_ddpme(dd, &comm->ddpme[0], 0);
7047 if (comm->npmedecompdim >= 2)
7049 init_ddpme(dd, &comm->ddpme[1], 1);
7052 else
7054 comm->npmenodes = 0;
7055 if (dd->pme_nodeid >= 0)
7057 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7058 "Can not have separate PME ranks without PME electrostatics");
7062 if (debug)
7064 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7066 if (comm->dlbState != edlbsOffForever)
7068 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7071 print_dd_settings(fplog, dd, mtop, ir, dlbIsOn(comm), dlb_scale, ddbox);
7072 if (comm->dlbState == edlbsOffCanTurnOn)
7074 if (fplog)
7076 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7078 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7081 if (ir->ePBC == epbcNONE)
7083 vol_frac = 1 - 1/(double)dd->nnodes;
7085 else
7087 vol_frac =
7088 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7090 if (debug)
7092 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7094 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
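/* vol_frac*natoms_tot estimates the number of atoms in the zones of one rank */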
7096 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7099 /*! \brief Set some important DD parameters that can be modified by env.vars */
7100 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7102 gmx_domdec_comm_t *comm = dd->comm;
7104 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7105 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7106 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7107 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7108 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7109 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7110 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7112 if (dd->bSendRecv2 && fplog)
7114 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7117 if (comm->eFlop)
7119 if (fplog)
7121 fprintf(fplog, "Will load balance based on FLOP count\n");
7123 if (comm->eFlop > 1)
7125 srand(1 + rank_mysim);
7127 comm->bRecordLoad = TRUE;
7129 else
7131 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7135 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7136 unsigned long Flags,
7137 ivec nc, int nPmeRanks,
7138 int dd_rank_order,
7139 real comm_distance_min, real rconstr,
7140 const char *dlb_opt, real dlb_scale,
7141 const char *sizex, const char *sizey, const char *sizez,
7142 const gmx_mtop_t *mtop,
7143 const t_inputrec *ir,
7144 matrix box, rvec *x,
7145 gmx_ddbox_t *ddbox,
7146 int *npme_x, int *npme_y)
7148 gmx_domdec_t *dd;
7150 if (fplog)
7152 fprintf(fplog,
7153 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7156 snew(dd, 1);
7158 dd->comm = init_dd_comm();
7160 set_dd_envvar_options(fplog, dd, cr->nodeid);
7162 set_dd_limits_and_grid(fplog, cr, dd, Flags,
7163 nc, nPmeRanks,
7164 comm_distance_min, rconstr,
7165 dlb_opt, dlb_scale,
7166 sizex, sizey, sizez,
7167 mtop, ir,
7168 box, x,
7169 ddbox,
7170 npme_x, npme_y);
7172 make_dd_communicators(fplog, cr, dd, dd_rank_order);
7174 if (cr->duty & DUTY_PP)
7176 set_ddgrid_parameters(fplog, dd, dlb_scale, mtop, ir, ddbox);
7178 setup_neighbor_relations(dd);
7181 /* Set overallocation to avoid frequent reallocation of arrays */
7182 set_over_alloc_dd(TRUE);
7184 /* Initialize DD partitioning counters */
7185 dd->comm->partition_step = INT_MIN;
7186 dd->ddp_count = 0;
7188 /* We don't know the number of threads yet; this is set later */
7189 dd->comm->nth = 0;
7191 clear_dd_cycle_counts(dd);
7193 return dd;
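/*! \brief Checks whether the current domain decomposition can support the requested cut-off, i.e. whether it fits within the available pulses and cell sizes */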
7196 static gmx_bool test_dd_cutoff(t_commrec *cr,
7197 t_state *state, const t_inputrec *ir,
7198 real cutoff_req)
7200 gmx_domdec_t *dd;
7201 gmx_ddbox_t ddbox;
7202 int d, dim, np;
7203 real inv_cell_size;
7204 int LocallyLimited;
7206 dd = cr->dd;
7208 set_ddbox(dd, FALSE, cr, ir, state->box,
7209 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7211 LocallyLimited = 0;
7213 for (d = 0; d < dd->ndim; d++)
7215 dim = dd->dim[d];
7217 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7218 if (dynamic_dd_box(&ddbox, ir))
7220 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7223 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7225 if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7226 dd->comm->cd[d].np_dlb > 0)
7228 if (np > dd->comm->cd[d].np_dlb)
7230 return FALSE;
7233 /* If a current local cell size is smaller than the requested
7234 * cut-off, we could still fix it, but this gets very complicated.
7235 * Without fixing here, we might actually need more checks.
7237 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7239 LocallyLimited = 1;
7244 if (dd->comm->dlbState != edlbsOffForever)
7246 /* If DLB is not active yet, we don't need to check the grid jumps.
7247 * Actually we shouldn't, because then the grid jump data is not set.
7249 if (dlbIsOn(dd->comm) &&
7250 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7252 LocallyLimited = 1;
7255 gmx_sumi(1, &LocallyLimited, cr);
7257 if (LocallyLimited > 0)
7259 return FALSE;
7263 return TRUE;
7266 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7267 real cutoff_req)
7269 gmx_bool bCutoffAllowed;
7271 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7273 if (bCutoffAllowed)
7275 cr->dd->comm->cutoff = cutoff_req;
7278 return bCutoffAllowed;
7281 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7283 gmx_domdec_comm_t *comm;
7285 comm = cr->dd->comm;
7287 /* Turn on the DLB limiting (might have been on already) */
7288 comm->bPMELoadBalDLBLimits = TRUE;
7290 /* Change the cut-off limit */
7291 comm->PMELoadBal_max_cutoff = cutoff;
7293 if (debug)
7295 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7296 comm->PMELoadBal_max_cutoff);
7300 /* Sets whether we should later check the load imbalance data, so that
7301 * we can trigger dynamic load balancing if enough imbalance has
7302 * arisen.
7304 * Used after PME load balancing unlocks DLB, so that the check
7305 * whether DLB will be useful can happen immediately.
7307 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7309 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7311 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7313 if (bValue == TRUE)
7315 /* Store the DD partitioning count, so we can ignore cycle counts
7316 * over the next nstlist steps, which are often slower.
7318 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7323 /* Returns if we should check whether there has been enough load
7324 * imbalance to trigger dynamic load balancing.
7326 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7328 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7330 return FALSE;
7333 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7335 /* We ignore the first nstlist steps at the start of the run
7336 * or after PME load balancing or after turning DLB off, since
7337 * these often have extra allocation or cache miss overhead.
7339 return FALSE;
7342 /* We should check whether we should use DLB directly after
7343 * unlocking DLB. */
7344 if (dd->comm->bCheckWhetherToTurnDlbOn)
7346 /* This flag was set when the PME load-balancing routines
7347 unlocked DLB, and should now be cleared. */
7348 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7349 return TRUE;
7351 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7352 * partitionings (we do not do this every partitioning, so that we
7353 * avoid excessive communication). */
7354 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7356 return TRUE;
7359 return FALSE;
7362 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7364 return dlbIsOn(dd->comm);
7367 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7369 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7372 void dd_dlb_lock(gmx_domdec_t *dd)
7374 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7375 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7377 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7381 void dd_dlb_unlock(gmx_domdec_t *dd)
7383 /* We can only unlock the DLB when it is temporarily locked, otherwise don't do anything */
7384 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7386 dd->comm->dlbState = edlbsOffCanTurnOn;
7387 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
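/*! \brief Merges the charge-group data received in this pulse into the per-zone arrays, shifting the data already stored for earlier pulses where needed */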
7391 static void merge_cg_buffers(int ncell,
7392 gmx_domdec_comm_dim_t *cd, int pulse,
7393 int *ncg_cell,
7394 int *index_gl, int *recv_i,
7395 rvec *cg_cm, rvec *recv_vr,
7396 int *cgindex,
7397 cginfo_mb_t *cginfo_mb, int *cginfo)
7399 gmx_domdec_ind_t *ind, *ind_p;
7400 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7401 int shift, shift_at;
7403 ind = &cd->ind[pulse];
7405 /* First correct the already stored data */
7406 shift = ind->nrecv[ncell];
7407 for (cell = ncell-1; cell >= 0; cell--)
7409 shift -= ind->nrecv[cell];
7410 if (shift > 0)
7412 /* Move the cg's present from previous grid pulses */
7413 cg0 = ncg_cell[ncell+cell];
7414 cg1 = ncg_cell[ncell+cell+1];
7415 cgindex[cg1+shift] = cgindex[cg1];
7416 for (cg = cg1-1; cg >= cg0; cg--)
7418 index_gl[cg+shift] = index_gl[cg];
7419 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7420 cgindex[cg+shift] = cgindex[cg];
7421 cginfo[cg+shift] = cginfo[cg];
7423 /* Correct the already stored send indices for the shift */
7424 for (p = 1; p <= pulse; p++)
7426 ind_p = &cd->ind[p];
7427 cg0 = 0;
7428 for (c = 0; c < cell; c++)
7430 cg0 += ind_p->nsend[c];
7432 cg1 = cg0 + ind_p->nsend[cell];
7433 for (cg = cg0; cg < cg1; cg++)
7435 ind_p->index[cg] += shift;
7441 /* Merge in the communicated buffers */
7442 shift = 0;
7443 shift_at = 0;
7444 cg0 = 0;
7445 for (cell = 0; cell < ncell; cell++)
7447 cg1 = ncg_cell[ncell+cell+1] + shift;
7448 if (shift_at > 0)
7450 /* Correct the old cg indices */
7451 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7453 cgindex[cg+1] += shift_at;
7456 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7458 /* Copy this charge group from the buffer */
7459 index_gl[cg1] = recv_i[cg0];
7460 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7461 /* Add it to the cgindex */
7462 cg_gl = index_gl[cg1];
7463 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7464 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7465 cgindex[cg1+1] = cgindex[cg1] + nat;
7466 cg0++;
7467 cg1++;
7468 shift_at += nat;
7470 shift += ind->nrecv[cell];
7471 ncg_cell[ncell+cell+1] = cg1;
7475 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7476 int nzone, int cg0, const int *cgindex)
7478 int cg, zone, p;
7480 /* Store the atom block boundaries for easy copying of communication buffers
7482 cg = cg0;
7483 for (zone = 0; zone < nzone; zone++)
7485 for (p = 0; p < cd->np; p++)
7487 cd->ind[p].cell2at0[zone] = cgindex[cg];
7488 cg += cd->ind[p].nrecv[zone];
7489 cd->ind[p].cell2at1[zone] = cgindex[cg];
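/*! \brief Returns TRUE when charge group cg_gl is linked to a charge group that is not marked as local in bLocalCG */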
7494 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7496 int i;
7497 gmx_bool bMiss;
7499 bMiss = FALSE;
7500 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7502 if (!bLocalCG[link->a[i]])
7504 bMiss = TRUE;
7508 return bMiss;
7511 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7512 typedef struct {
7513 real c[DIM][4]; /* the corners for the non-bonded communication */
7514 real cr0; /* corner for rounding */
7515 real cr1[4]; /* corners for rounding */
7516 real bc[DIM]; /* corners for bonded communication */
7517 real bcr1; /* corner for rounding for bonded communication */
7518 } dd_corners_t;
7520 /* Determine the corners of the domain(s) we are communicating with */
7521 static void
7522 set_dd_corners(const gmx_domdec_t *dd,
7523 int dim0, int dim1, int dim2,
7524 gmx_bool bDistMB,
7525 dd_corners_t *c)
7527 const gmx_domdec_comm_t *comm;
7528 const gmx_domdec_zones_t *zones;
7529 int i, j;
7531 comm = dd->comm;
7533 zones = &comm->zones;
7535 /* Keep the compiler happy */
7536 c->cr0 = 0;
7537 c->bcr1 = 0;
7539 /* The first dimension is equal for all cells */
7540 c->c[0][0] = comm->cell_x0[dim0];
7541 if (bDistMB)
7543 c->bc[0] = c->c[0][0];
7545 if (dd->ndim >= 2)
7547 dim1 = dd->dim[1];
7548 /* This cell row is only seen from the first row */
7549 c->c[1][0] = comm->cell_x0[dim1];
7550 /* All rows can see this row */
7551 c->c[1][1] = comm->cell_x0[dim1];
7552 if (dlbIsOn(dd->comm))
7554 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7555 if (bDistMB)
7557 /* For the multi-body distance we need the maximum */
7558 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7561 /* Set the upper-right corner for rounding */
7562 c->cr0 = comm->cell_x1[dim0];
7564 if (dd->ndim >= 3)
7566 dim2 = dd->dim[2];
7567 for (j = 0; j < 4; j++)
7569 c->c[2][j] = comm->cell_x0[dim2];
7571 if (dlbIsOn(dd->comm))
7573 /* Use the maximum of the i-cells that see a j-cell */
7574 for (i = 0; i < zones->nizone; i++)
7576 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7578 if (j >= 4)
7580 c->c[2][j-4] =
7581 std::max(c->c[2][j-4],
7582 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7586 if (bDistMB)
7588 /* For the multi-body distance we need the maximum */
7589 c->bc[2] = comm->cell_x0[dim2];
7590 for (i = 0; i < 2; i++)
7592 for (j = 0; j < 2; j++)
7594 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7600 /* Set the upper-right corner for rounding */
7601 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7602 * Only cell (0,0,0) can see cell 7 (1,1,1)
7604 c->cr1[0] = comm->cell_x1[dim1];
7605 c->cr1[3] = comm->cell_x1[dim1];
7606 if (dlbIsOn(dd->comm))
7608 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7609 if (bDistMB)
7611 /* For the multi-body distance we need the maximum */
7612 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7619 /* Determine which cg's we need to send in this pulse from this zone */
7620 static void
7621 get_zone_pulse_cgs(gmx_domdec_t *dd,
7622 int zonei, int zone,
7623 int cg0, int cg1,
7624 const int *index_gl,
7625 const int *cgindex,
7626 int dim, int dim_ind,
7627 int dim0, int dim1, int dim2,
7628 real r_comm2, real r_bcomm2,
7629 matrix box,
7630 ivec tric_dist,
7631 rvec *normal,
7632 real skew_fac2_d, real skew_fac_01,
7633 rvec *v_d, rvec *v_0, rvec *v_1,
7634 const dd_corners_t *c,
7635 rvec sf2_round,
7636 gmx_bool bDistBonded,
7637 gmx_bool bBondComm,
7638 gmx_bool bDist2B,
7639 gmx_bool bDistMB,
7640 rvec *cg_cm,
7641 int *cginfo,
7642 gmx_domdec_ind_t *ind,
7643 int **ibuf, int *ibuf_nalloc,
7644 vec_rvec_t *vbuf,
7645 int *nsend_ptr,
7646 int *nat_ptr,
7647 int *nsend_z_ptr)
7649 gmx_domdec_comm_t *comm;
7650 gmx_bool bScrew;
7651 gmx_bool bDistMB_pulse;
7652 int cg, i;
7653 real r2, rb2, r, tric_sh;
7654 rvec rn, rb;
7655 int dimd;
7656 int nsend_z, nsend, nat;
7658 comm = dd->comm;
7660 bScrew = (dd->bScrewPBC && dim == XX);
7662 bDistMB_pulse = (bDistMB && bDistBonded);
7664 nsend_z = 0;
7665 nsend = *nsend_ptr;
7666 nat = *nat_ptr;
7668 for (cg = cg0; cg < cg1; cg++)
7670 r2 = 0;
7671 rb2 = 0;
7672 if (tric_dist[dim_ind] == 0)
7674 /* Rectangular direction, easy */
7675 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7676 if (r > 0)
7678 r2 += r*r;
7680 if (bDistMB_pulse)
7682 r = cg_cm[cg][dim] - c->bc[dim_ind];
7683 if (r > 0)
7685 rb2 += r*r;
7688 /* Rounding gives at most a 16% reduction
7689 * in communicated atoms
7691 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7693 r = cg_cm[cg][dim0] - c->cr0;
7694 /* This is the first dimension, so always r >= 0 */
7695 r2 += r*r;
7696 if (bDistMB_pulse)
7698 rb2 += r*r;
7701 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7703 r = cg_cm[cg][dim1] - c->cr1[zone];
7704 if (r > 0)
7706 r2 += r*r;
7708 if (bDistMB_pulse)
7710 r = cg_cm[cg][dim1] - c->bcr1;
7711 if (r > 0)
7713 rb2 += r*r;
7718 else
7720 /* Triclinic direction, more complicated */
7721 clear_rvec(rn);
7722 clear_rvec(rb);
7723 /* Rounding, conservative as the skew_fac multiplication
7724 * will slightly underestimate the distance.
7726 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7728 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7729 for (i = dim0+1; i < DIM; i++)
7731 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7733 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7734 if (bDistMB_pulse)
7736 rb[dim0] = rn[dim0];
7737 rb2 = r2;
7739 /* Take care that the cell planes along dim0 might not
7740 * be orthogonal to those along dim1 and dim2.
7742 for (i = 1; i <= dim_ind; i++)
7744 dimd = dd->dim[i];
7745 if (normal[dim0][dimd] > 0)
7747 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7748 if (bDistMB_pulse)
7750 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7755 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7757 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7758 tric_sh = 0;
7759 for (i = dim1+1; i < DIM; i++)
7761 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7763 rn[dim1] += tric_sh;
7764 if (rn[dim1] > 0)
7766 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7767 /* Take care of coupling of the distances
7768 * to the planes along dim0 and dim1 through dim2.
7770 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7771 /* Take care that the cell planes along dim1
7772 * might not be orthogonal to that along dim2.
7774 if (normal[dim1][dim2] > 0)
7776 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7779 if (bDistMB_pulse)
7781 rb[dim1] +=
7782 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7783 if (rb[dim1] > 0)
7785 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7786 /* Take care of coupling of the distances
7787 * to the planes along dim0 and dim1 through dim2.
7789 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7790 /* Take care that the cell planes along dim1
7791 * might not be orthogonal to that along dim2.
7793 if (normal[dim1][dim2] > 0)
7795 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7800 /* The distance along the communication direction */
7801 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7802 tric_sh = 0;
7803 for (i = dim+1; i < DIM; i++)
7805 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7807 rn[dim] += tric_sh;
7808 if (rn[dim] > 0)
7810 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7811 /* Take care of coupling of the distances
7812 * to the planes along dim0 and dim1 through dim2.
7814 if (dim_ind == 1 && zonei == 1)
7816 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7819 if (bDistMB_pulse)
7821 clear_rvec(rb);
7822 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7823 if (rb[dim] > 0)
7825 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7826 /* Take care of coupling of the distances
7827 * to the planes along dim0 and dim1 through dim2.
7829 if (dim_ind == 1 && zonei == 1)
7831 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7837 if (r2 < r_comm2 ||
7838 (bDistBonded &&
7839 ((bDistMB && rb2 < r_bcomm2) ||
7840 (bDist2B && r2 < r_bcomm2)) &&
7841 (!bBondComm ||
7842 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7843 missing_link(comm->cglink, index_gl[cg],
7844 comm->bLocalCG)))))
7846 /* Make an index to the local charge groups */
7847 if (nsend+1 > ind->nalloc)
7849 ind->nalloc = over_alloc_large(nsend+1);
7850 srenew(ind->index, ind->nalloc);
7852 if (nsend+1 > *ibuf_nalloc)
7854 *ibuf_nalloc = over_alloc_large(nsend+1);
7855 srenew(*ibuf, *ibuf_nalloc);
7857 ind->index[nsend] = cg;
7858 (*ibuf)[nsend] = index_gl[cg];
7859 nsend_z++;
7860 vec_rvec_check_alloc(vbuf, nsend+1);
7862 if (dd->ci[dim] == 0)
7864 /* Correct cg_cm for pbc */
7865 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7866 if (bScrew)
7868 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7869 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7872 else
7874 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7876 nsend++;
7877 nat += cgindex[cg+1] - cgindex[cg];
7881 *nsend_ptr = nsend;
7882 *nat_ptr = nat;
7883 *nsend_z_ptr = nsend_z;
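/*! \brief Sets up the DD halo communication: for every decomposition dimension and pulse, selects the charge groups to send and communicates their global indices and coordinates */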
7886 static void setup_dd_communication(gmx_domdec_t *dd,
7887 matrix box, gmx_ddbox_t *ddbox,
7888 t_forcerec *fr,
7889 t_state *state, PaddedRVecVector *f)
7891 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7892 int nzone, nzone_send, zone, zonei, cg0, cg1;
7893 int c, i, cg, cg_gl, nrcg;
7894 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7895 gmx_domdec_comm_t *comm;
7896 gmx_domdec_zones_t *zones;
7897 gmx_domdec_comm_dim_t *cd;
7898 gmx_domdec_ind_t *ind;
7899 cginfo_mb_t *cginfo_mb;
7900 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7901 real r_comm2, r_bcomm2;
7902 dd_corners_t corners;
7903 ivec tric_dist;
7904 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
7905 real skew_fac2_d, skew_fac_01;
7906 rvec sf2_round;
7907 int nsend, nat;
7908 int th;
7910 if (debug)
7912 fprintf(debug, "Setting up DD communication\n");
7915 comm = dd->comm;
7917 if (comm->nth == 0)
7919 /* Initialize the thread data.
7920 * This can not be done in init_domain_decomposition,
7921 * as the number of threads is determined later.
7923 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7924 if (comm->nth > 1)
7926 snew(comm->dth, comm->nth);
7930 switch (fr->cutoff_scheme)
7932 case ecutsGROUP:
7933 cg_cm = fr->cg_cm;
7934 break;
7935 case ecutsVERLET:
7936 cg_cm = as_rvec_array(state->x.data());
7937 break;
7938 default:
7939 gmx_incons("unimplemented");
7940 cg_cm = nullptr;
7943 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7945 /* Check if we need to use triclinic distances */
7946 tric_dist[dim_ind] = 0;
7947 for (i = 0; i <= dim_ind; i++)
7949 if (ddbox->tric_dir[dd->dim[i]])
7951 tric_dist[dim_ind] = 1;
7956 bBondComm = comm->bBondComm;
7958 /* Do we need to determine extra distances for multi-body bondeds? */
7959 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
7961 /* Do we need to determine extra distances for only two-body bondeds? */
7962 bDist2B = (bBondComm && !bDistMB);
7964 r_comm2 = gmx::square(comm->cutoff);
7965 r_bcomm2 = gmx::square(comm->cutoff_mbody);
7967 if (debug)
7969 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
7972 zones = &comm->zones;
7974 dim0 = dd->dim[0];
7975 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7976 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7978 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
7980 /* Triclinic stuff */
7981 normal = ddbox->normal;
7982 skew_fac_01 = 0;
7983 if (dd->ndim >= 2)
7985 v_0 = ddbox->v[dim0];
7986 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7988 /* Determine the coupling coefficient for the distances
7989 * to the cell planes along dim0 and dim1 through dim2.
7990 * This is required for correct rounding.
7992 skew_fac_01 =
7993 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7994 if (debug)
7996 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8000 if (dd->ndim >= 3)
8002 v_1 = ddbox->v[dim1];
8005 zone_cg_range = zones->cg_range;
8006 index_gl = dd->index_gl;
8007 cgindex = dd->cgindex;
8008 cginfo_mb = fr->cginfo_mb;
8010 zone_cg_range[0] = 0;
8011 zone_cg_range[1] = dd->ncg_home;
8012 comm->zone_ncg1[0] = dd->ncg_home;
8013 pos_cg = dd->ncg_home;
8015 nat_tot = dd->nat_home;
8016 nzone = 1;
8017 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8019 dim = dd->dim[dim_ind];
8020 cd = &comm->cd[dim_ind];
8022 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8024 /* No pbc in this dimension, the first node should not comm. */
8025 nzone_send = 0;
8027 else
8029 nzone_send = nzone;
8032 v_d = ddbox->v[dim];
8033 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8035 cd->bInPlace = TRUE;
8036 for (p = 0; p < cd->np; p++)
8038 /* Only atoms communicated in the first pulse are used
8039 * for multi-body bonded interactions or for bBondComm.
8041 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8043 ind = &cd->ind[p];
8044 nsend = 0;
8045 nat = 0;
8046 for (zone = 0; zone < nzone_send; zone++)
8048 if (tric_dist[dim_ind] && dim_ind > 0)
8050 /* Determine slightly more optimized skew_fac's
8051 * for rounding.
8052 * This reduces the number of communicated atoms
8053 * by about 10% for 3D DD of rhombic dodecahedra.
8055 for (dimd = 0; dimd < dim; dimd++)
8057 sf2_round[dimd] = 1;
8058 if (ddbox->tric_dir[dimd])
8060 for (i = dd->dim[dimd]+1; i < DIM; i++)
8062 /* If we are shifted in dimension i
8063 * and the cell plane is tilted forward
8064 * in dimension i, skip this coupling.
8066 if (!(zones->shift[nzone+zone][i] &&
8067 ddbox->v[dimd][i][dimd] >= 0))
8069 sf2_round[dimd] +=
8070 gmx::square(ddbox->v[dimd][i][dimd]);
8073 sf2_round[dimd] = 1/sf2_round[dimd];
8078 zonei = zone_perm[dim_ind][zone];
8079 if (p == 0)
8081 /* Here we permute the zones to obtain a convenient order
8082 * for neighbor searching
8084 cg0 = zone_cg_range[zonei];
8085 cg1 = zone_cg_range[zonei+1];
8087 else
8089 /* Look only at the cg's received in the previous grid pulse
8091 cg1 = zone_cg_range[nzone+zone+1];
8092 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8095 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8096 for (th = 0; th < comm->nth; th++)
8100 gmx_domdec_ind_t *ind_p;
8101 int **ibuf_p, *ibuf_nalloc_p;
8102 vec_rvec_t *vbuf_p;
8103 int *nsend_p, *nat_p;
8104 int *nsend_zone_p;
8105 int cg0_th, cg1_th;
8107 if (th == 0)
8109 /* Thread 0 writes in the comm buffers */
8110 ind_p = ind;
8111 ibuf_p = &comm->buf_int;
8112 ibuf_nalloc_p = &comm->nalloc_int;
8113 vbuf_p = &comm->vbuf;
8114 nsend_p = &nsend;
8115 nat_p = &nat;
8116 nsend_zone_p = &ind->nsend[zone];
8118 else
8120 /* Other threads write into temp buffers */
8121 ind_p = &comm->dth[th].ind;
8122 ibuf_p = &comm->dth[th].ibuf;
8123 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8124 vbuf_p = &comm->dth[th].vbuf;
8125 nsend_p = &comm->dth[th].nsend;
8126 nat_p = &comm->dth[th].nat;
8127 nsend_zone_p = &comm->dth[th].nsend_zone;
8129 comm->dth[th].nsend = 0;
8130 comm->dth[th].nat = 0;
8131 comm->dth[th].nsend_zone = 0;
8134 if (comm->nth == 1)
8136 cg0_th = cg0;
8137 cg1_th = cg1;
8139 else
8141 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8142 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8145 /* Get the cg's for this pulse in this zone */
8146 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8147 index_gl, cgindex,
8148 dim, dim_ind, dim0, dim1, dim2,
8149 r_comm2, r_bcomm2,
8150 box, tric_dist,
8151 normal, skew_fac2_d, skew_fac_01,
8152 v_d, v_0, v_1, &corners, sf2_round,
8153 bDistBonded, bBondComm,
8154 bDist2B, bDistMB,
8155 cg_cm, fr->cginfo,
8156 ind_p,
8157 ibuf_p, ibuf_nalloc_p,
8158 vbuf_p,
8159 nsend_p, nat_p,
8160 nsend_zone_p);
8162 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8163 } // END
8165 /* Append data of threads>=1 to the communication buffers */
8166 for (th = 1; th < comm->nth; th++)
8168 dd_comm_setup_work_t *dth;
8169 int i, ns1;
8171 dth = &comm->dth[th];
8173 ns1 = nsend + dth->nsend_zone;
8174 if (ns1 > ind->nalloc)
8176 ind->nalloc = over_alloc_dd(ns1);
8177 srenew(ind->index, ind->nalloc);
8179 if (ns1 > comm->nalloc_int)
8181 comm->nalloc_int = over_alloc_dd(ns1);
8182 srenew(comm->buf_int, comm->nalloc_int);
8184 if (ns1 > comm->vbuf.nalloc)
8186 comm->vbuf.nalloc = over_alloc_dd(ns1);
8187 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8190 for (i = 0; i < dth->nsend_zone; i++)
8192 ind->index[nsend] = dth->ind.index[i];
8193 comm->buf_int[nsend] = dth->ibuf[i];
8194 copy_rvec(dth->vbuf.v[i],
8195 comm->vbuf.v[nsend]);
8196 nsend++;
8198 nat += dth->nat;
8199 ind->nsend[zone] += dth->nsend_zone;
8202 /* Clear the counts in case we do not have pbc */
8203 for (zone = nzone_send; zone < nzone; zone++)
8205 ind->nsend[zone] = 0;
8207 ind->nsend[nzone] = nsend;
8208 ind->nsend[nzone+1] = nat;
8209 /* Communicate the number of cg's and atoms to receive */
8210 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8211 ind->nsend, nzone+2,
8212 ind->nrecv, nzone+2);
8214 /* The rvec buffer is also required for atom buffers of size nsend
8215 * in dd_move_x and dd_move_f.
8217 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8219 if (p > 0)
8221 /* We can receive in place if only the last zone is not empty */
8222 for (zone = 0; zone < nzone-1; zone++)
8224 if (ind->nrecv[zone] > 0)
8226 cd->bInPlace = FALSE;
8229 if (!cd->bInPlace)
8231 /* The int buffer is only required here for the cg indices */
8232 if (ind->nrecv[nzone] > comm->nalloc_int2)
8234 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8235 srenew(comm->buf_int2, comm->nalloc_int2);
8237 /* The rvec buffer is also required for atom buffers
8238 * of size nrecv in dd_move_x and dd_move_f.
8240 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8241 vec_rvec_check_alloc(&comm->vbuf2, i);
8245 /* Make space for the global cg indices */
8246 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8247 || dd->cg_nalloc == 0)
8249 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8250 srenew(index_gl, dd->cg_nalloc);
8251 srenew(cgindex, dd->cg_nalloc+1);
8253 /* Communicate the global cg indices */
8254 if (cd->bInPlace)
8256 recv_i = index_gl + pos_cg;
8258 else
8260 recv_i = comm->buf_int2;
8262 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8263 comm->buf_int, nsend,
8264 recv_i, ind->nrecv[nzone]);
8266 /* Make space for cg_cm */
8267 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8268 if (fr->cutoff_scheme == ecutsGROUP)
8270 cg_cm = fr->cg_cm;
8272 else
8274 cg_cm = as_rvec_array(state->x.data());
8276 /* Communicate cg_cm */
8277 if (cd->bInPlace)
8279 recv_vr = cg_cm + pos_cg;
8281 else
8283 recv_vr = comm->vbuf2.v;
8285 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8286 comm->vbuf.v, nsend,
8287 recv_vr, ind->nrecv[nzone]);
8289 /* Make the charge group index */
8290 if (cd->bInPlace)
8292 zone = (p == 0 ? 0 : nzone - 1);
8293 while (zone < nzone)
8295 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8297 cg_gl = index_gl[pos_cg];
8298 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8299 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8300 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8301 if (bBondComm)
8303 /* Update the charge group presence,
8304 * so we can use it in the next pass of the loop.
8306 comm->bLocalCG[cg_gl] = TRUE;
8308 pos_cg++;
8310 if (p == 0)
8312 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8314 zone++;
8315 zone_cg_range[nzone+zone] = pos_cg;
8318 else
8320 /* This part of the code is never executed with bBondComm. */
8321 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8322 index_gl, recv_i, cg_cm, recv_vr,
8323 cgindex, fr->cginfo_mb, fr->cginfo);
8324 pos_cg += ind->nrecv[nzone];
8326 nat_tot += ind->nrecv[nzone+1];
8328 if (!cd->bInPlace)
8330 /* Store the atom block for easy copying of communication buffers */
8331 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8333 nzone += nzone;
8335 dd->index_gl = index_gl;
8336 dd->cgindex = cgindex;
8338 dd->ncg_tot = zone_cg_range[zones->n];
8339 dd->nat_tot = nat_tot;
8340 comm->nat[ddnatHOME] = dd->nat_home;
8341 for (i = ddnatZONE; i < ddnatNR; i++)
8343 comm->nat[i] = dd->nat_tot;
8346 if (!bBondComm)
8348 /* We don't need to update cginfo, since that was already done above.
8349 * So we pass NULL for the forcerec.
8351 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8352 nullptr, comm->bLocalCG);
8355 if (debug)
8357 fprintf(debug, "Finished setting up DD communication, zones:");
8358 for (c = 0; c < zones->n; c++)
8360 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8362 fprintf(debug, "\n");
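/*! \brief Sets the charge-group boundaries of the i-zones and of the j-zone ranges they interact with */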
8366 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8368 int c;
8370 for (c = 0; c < zones->nizone; c++)
8372 zones->izone[c].cg1 = zones->cg_range[c+1];
8373 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8374 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
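/*! \brief Sets the zone limits and bounding boxes for zones zone_start to zone_end-1, including staggering and triclinic corrections */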
8378 static void set_zones_size(gmx_domdec_t *dd,
8379 matrix box, const gmx_ddbox_t *ddbox,
8380 int zone_start, int zone_end)
8382 gmx_domdec_comm_t *comm;
8383 gmx_domdec_zones_t *zones;
8384 gmx_bool bDistMB;
8385 int z, zi, d, dim;
8386 real rcs, rcmbs;
8387 int i, j;
8388 real vol;
8390 comm = dd->comm;
8392 zones = &comm->zones;
8394 /* Do we need to determine extra distances for multi-body bondeds? */
8395 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8397 for (z = zone_start; z < zone_end; z++)
8399 /* Copy cell limits to zone limits.
8400 * Valid for non-DD dims and non-shifted dims.
8402 copy_rvec(comm->cell_x0, zones->size[z].x0);
8403 copy_rvec(comm->cell_x1, zones->size[z].x1);
8406 for (d = 0; d < dd->ndim; d++)
8408 dim = dd->dim[d];
8410 for (z = 0; z < zones->n; z++)
8412 /* With a staggered grid we have different sizes
8413 * for non-shifted dimensions.
8415 if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8417 if (d == 1)
8419 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8420 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8422 else if (d == 2)
8424 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8425 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8430 rcs = comm->cutoff;
8431 rcmbs = comm->cutoff_mbody;
8432 if (ddbox->tric_dir[dim])
8434 rcs /= ddbox->skew_fac[dim];
8435 rcmbs /= ddbox->skew_fac[dim];
8438 /* Set the lower limit for the shifted zone dimensions */
8439 for (z = zone_start; z < zone_end; z++)
8441 if (zones->shift[z][dim] > 0)
8443 dim = dd->dim[d];
8444 if (!dlbIsOn(dd->comm) || d == 0)
8446 zones->size[z].x0[dim] = comm->cell_x1[dim];
8447 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8449 else
8451 /* Here we take the lower limit of the zone from
8452 * the lowest domain of the zone below.
8454 if (z < 4)
8456 zones->size[z].x0[dim] =
8457 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8459 else
8461 if (d == 1)
8463 zones->size[z].x0[dim] =
8464 zones->size[zone_perm[2][z-4]].x0[dim];
8466 else
8468 zones->size[z].x0[dim] =
8469 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8472 /* A temporary limit, updated below */
8473 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8475 if (bDistMB)
8477 for (zi = 0; zi < zones->nizone; zi++)
8479 if (zones->shift[zi][dim] == 0)
8481 /* This takes the whole zone into account.
8482 * With multiple pulses this will lead
8483 * to a larger zone than strictly necessary.
8485 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8486 zones->size[zi].x1[dim]+rcmbs);
8494 /* Loop over the i-zones to set the upper limit of each
8495 * j-zone they see.
8497 for (zi = 0; zi < zones->nizone; zi++)
8499 if (zones->shift[zi][dim] == 0)
8501 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8503 if (zones->shift[z][dim] > 0)
8505 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8506 zones->size[zi].x1[dim]+rcs);
8513 for (z = zone_start; z < zone_end; z++)
8515 /* Initialization only required to keep the compiler happy */
8516 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8517 int nc, c;
8519 /* To determine the bounding box for a zone we need to find
8520 * the extremes of 4, 2 or 1 corners.
8522 nc = 1 << (ddbox->nboundeddim - 1);
8524 for (c = 0; c < nc; c++)
8526 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8527 corner[XX] = 0;
8528 if ((c & 1) == 0)
8530 corner[YY] = zones->size[z].x0[YY];
8532 else
8534 corner[YY] = zones->size[z].x1[YY];
8536 if ((c & 2) == 0)
8538 corner[ZZ] = zones->size[z].x0[ZZ];
8540 else
8542 corner[ZZ] = zones->size[z].x1[ZZ];
8544 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8545 box[ZZ][1 - dd->dim[0]] != 0)
8547 /* With 1D domain decomposition the cg's are not in
8548 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8549 * Shift the corner of the z-vector back along the box
8550 * vector of dimension d, so it will later end up at 0 along d.
8551 * This can affect the location of this corner along dd->dim[0]
8552 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8554 int d = 1 - dd->dim[0];
8556 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8558 /* Apply the triclinic couplings */
8559 assert(ddbox->npbcdim <= DIM);
8560 for (i = YY; i < ddbox->npbcdim; i++)
8562 for (j = XX; j < i; j++)
8564 corner[j] += corner[i]*box[i][j]/box[i][i];
8567 if (c == 0)
8569 copy_rvec(corner, corner_min);
8570 copy_rvec(corner, corner_max);
8572 else
8574 for (i = 0; i < DIM; i++)
8576 corner_min[i] = std::min(corner_min[i], corner[i]);
8577 corner_max[i] = std::max(corner_max[i], corner[i]);
8581 /* Copy the extreme corners without offset along x */
8582 for (i = 0; i < DIM; i++)
8584 zones->size[z].bb_x0[i] = corner_min[i];
8585 zones->size[z].bb_x1[i] = corner_max[i];
8587 /* Add the offset along x */
8588 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8589 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8592 if (zone_start == 0)
8594 vol = 1;
8595 for (dim = 0; dim < DIM; dim++)
8597 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8599 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8602 if (debug)
8604 for (z = zone_start; z < zone_end; z++)
8606 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8608 zones->size[z].x0[XX], zones->size[z].x1[XX],
8609 zones->size[z].x0[YY], zones->size[z].x1[YY],
8610 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8611 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8613 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8614 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8615 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
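/*! \brief qsort comparator for charge groups: sorts on the ns grid cell index, then on the global index */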
8620 static int comp_cgsort(const void *a, const void *b)
8622 int comp;
8624 gmx_cgsort_t *cga, *cgb;
8625 cga = (gmx_cgsort_t *)a;
8626 cgb = (gmx_cgsort_t *)b;
8628 comp = cga->nsc - cgb->nsc;
8629 if (comp == 0)
8631 comp = cga->ind_gl - cgb->ind_gl;
8634 return comp;
8637 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8638 int *a, int *buf)
8640 int i;
8642 /* Order the data */
8643 for (i = 0; i < n; i++)
8645 buf[i] = a[sort[i].ind];
8648 /* Copy back to the original array */
8649 for (i = 0; i < n; i++)
8651 a[i] = buf[i];
8655 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8656 rvec *v, rvec *buf)
8658 int i;
8660 /* Order the data */
8661 for (i = 0; i < n; i++)
8663 copy_rvec(v[sort[i].ind], buf[i]);
8666 /* Copy back to the original array */
8667 for (i = 0; i < n; i++)
8669 copy_rvec(buf[i], v[i]);
8673 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8674 rvec *v, rvec *buf)
8676 int a, atot, cg, cg0, cg1, i;
8678 if (cgindex == nullptr)
8680 /* Avoid the useless loop of the atoms within a cg */
8681 order_vec_cg(ncg, sort, v, buf);
8683 return;
8686 /* Order the data */
8687 a = 0;
8688 for (cg = 0; cg < ncg; cg++)
8690 cg0 = cgindex[sort[cg].ind];
8691 cg1 = cgindex[sort[cg].ind+1];
8692 for (i = cg0; i < cg1; i++)
8694 copy_rvec(v[i], buf[a]);
8695 a++;
8698 atot = a;
8700 /* Copy back to the original array */
8701 for (a = 0; a < atot; a++)
8703 copy_rvec(buf[a], v[a]);
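/*! \brief Merges the already ordered array sort2 with the qsorted new entries sort_new into sort1 */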
8707 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8708 int nsort_new, gmx_cgsort_t *sort_new,
8709 gmx_cgsort_t *sort1)
8711 int i1, i2, i_new;
8713 /* The new indices are not very ordered, so we qsort them */
8714 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8716 /* sort2 is already ordered, so now we can merge the two arrays */
8717 i1 = 0;
8718 i2 = 0;
8719 i_new = 0;
8720 while (i2 < nsort2 || i_new < nsort_new)
8722 if (i2 == nsort2)
8724 sort1[i1++] = sort_new[i_new++];
8726 else if (i_new == nsort_new)
8728 sort1[i1++] = sort2[i2++];
8730 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8731 (sort2[i2].nsc == sort_new[i_new].nsc &&
8732 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8734 sort1[i1++] = sort2[i2++];
8736 else
8738 sort1[i1++] = sort_new[i_new++];
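/*! \brief Determines the charge-group sorting order for the group cut-off scheme; returns the new number of home charge groups */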
8743 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8745 gmx_domdec_sort_t *sort;
8746 gmx_cgsort_t *cgsort, *sort_i;
8747 int ncg_new, nsort2, nsort_new, i, *a, moved;
8749 sort = dd->comm->sort;
8751 a = fr->ns->grid->cell_index;
8753 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8755 if (ncg_home_old >= 0)
8757 /* The charge groups that remained in the same ns grid cell
8758 * are completely ordered. So we can sort efficiently by sorting
8759 * only the charge groups that did move and merging them into the stationary list.
8761 ncg_new = 0;
8762 nsort2 = 0;
8763 nsort_new = 0;
8764 for (i = 0; i < dd->ncg_home; i++)
8766 /* Check if this cg did not move to another node */
8767 if (a[i] < moved)
8769 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8771 /* This cg is new on this node or moved to another ns grid cell */
8772 if (nsort_new >= sort->sort_new_nalloc)
8774 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8775 srenew(sort->sort_new, sort->sort_new_nalloc);
8777 sort_i = &(sort->sort_new[nsort_new++]);
8779 else
8781 /* This cg did not move */
8782 sort_i = &(sort->sort2[nsort2++]);
8784 /* Sort on the ns grid cell indices
8785 * and the global topology index.
8786 * index_gl is irrelevant with cell ns,
8787 * but we set it here anyhow to avoid a conditional.
8789 sort_i->nsc = a[i];
8790 sort_i->ind_gl = dd->index_gl[i];
8791 sort_i->ind = i;
8792 ncg_new++;
8795 if (debug)
8797 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8798 nsort2, nsort_new);
8800 /* Sort efficiently */
8801 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8802 sort->sort);
8804 else
8806 cgsort = sort->sort;
8807 ncg_new = 0;
8808 for (i = 0; i < dd->ncg_home; i++)
8810 /* Sort on the ns grid cell indices
8811 * and the global topology index
8813 cgsort[i].nsc = a[i];
8814 cgsort[i].ind_gl = dd->index_gl[i];
8815 cgsort[i].ind = i;
8816 if (cgsort[i].nsc < moved)
8818 ncg_new++;
8821 if (debug)
8823 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8825 /* Determine the order of the charge groups using qsort */
8826 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8829 return ncg_new;
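/*! \brief Determines the atom sorting order from the nbnxn grid for the Verlet cut-off scheme; returns the new number of home atoms */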
8832 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8834 gmx_cgsort_t *sort;
8835 int ncg_new, i, na;
8836 const int *a;
8838 sort = dd->comm->sort->sort;
8840 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8842 ncg_new = 0;
8843 for (i = 0; i < na; i++)
8845 if (a[i] >= 0)
8847 sort[ncg_new].ind = a[i];
8848 ncg_new++;
8852 return ncg_new;
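/*! \brief Sorts the local state, charge-group positions and indices according to the ns/nbnxn grid order */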
8855 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8856 int ncg_home_old)
8858 gmx_domdec_sort_t *sort;
8859 gmx_cgsort_t *cgsort;
8860 int *cgindex;
8861 int ncg_new, i, *ibuf, cgsize;
8862 rvec *vbuf;
8864 sort = dd->comm->sort;
8866 if (dd->ncg_home > sort->sort_nalloc)
8868 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8869 srenew(sort->sort, sort->sort_nalloc);
8870 srenew(sort->sort2, sort->sort_nalloc);
8872 cgsort = sort->sort;
8874 switch (fr->cutoff_scheme)
8876 case ecutsGROUP:
8877 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8878 break;
8879 case ecutsVERLET:
8880 ncg_new = dd_sort_order_nbnxn(dd, fr);
8881 break;
8882 default:
8883 gmx_incons("unimplemented");
8884 ncg_new = 0;
8887 /* We alloc with the old size, since cgindex is still old */
8888 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8889 vbuf = dd->comm->vbuf.v;
8891 if (dd->comm->bCGs)
8893 cgindex = dd->cgindex;
8895 else
8897 cgindex = nullptr;
8900 /* Remove the charge groups which are no longer at home here */
8901 dd->ncg_home = ncg_new;
8902 if (debug)
8904 fprintf(debug, "Set the new home charge group count to %d\n",
8905 dd->ncg_home);
8908 /* Reorder the state */
8909 if (state->flags & (1 << estX))
8911 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
8913 if (state->flags & (1 << estV))
8915 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
8917 if (state->flags & (1 << estCGP))
8919 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
8922 if (fr->cutoff_scheme == ecutsGROUP)
8924 /* Reorder cgcm */
8925 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8928 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8930 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8931 srenew(sort->ibuf, sort->ibuf_nalloc);
8933 ibuf = sort->ibuf;
8934 /* Reorder the global cg index */
8935 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8936 /* Reorder the cginfo */
8937 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8938 /* Rebuild the local cg index */
8939 if (dd->comm->bCGs)
8941 ibuf[0] = 0;
8942 for (i = 0; i < dd->ncg_home; i++)
8944 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8945 ibuf[i+1] = ibuf[i] + cgsize;
8947 for (i = 0; i < dd->ncg_home+1; i++)
8949 dd->cgindex[i] = ibuf[i];
8952 else
8954 for (i = 0; i < dd->ncg_home+1; i++)
8956 dd->cgindex[i] = i;
8959 /* Set the home atom number */
8960 dd->nat_home = dd->cgindex[dd->ncg_home];
8962 if (fr->cutoff_scheme == ecutsVERLET)
8964 /* The atoms are now exactly in grid order, update the grid order */
8965 nbnxn_set_atomorder(fr->nbv->nbs);
8967 else
8969 /* Copy the sorted ns cell indices back to the ns grid struct */
8970 for (i = 0; i < dd->ncg_home; i++)
8972 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
8974 fr->ns->grid->nr = dd->ncg_home;
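/*! \brief Adds the atom counts of the zones of this decomposition to the DD statistics */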
8978 static void add_dd_statistics(gmx_domdec_t *dd)
8980 gmx_domdec_comm_t *comm;
8981 int ddnat;
8983 comm = dd->comm;
8985 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
8987 comm->sum_nat[ddnat-ddnatZONE] +=
8988 comm->nat[ddnat] - comm->nat[ddnat-1];
8990 comm->ndecomp++;
8993 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8995 gmx_domdec_comm_t *comm;
8996 int ddnat;
8998 comm = dd->comm;
9000 /* Reset all the statistics and counters for total run counting */
9001 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9003 comm->sum_nat[ddnat-ddnatZONE] = 0;
9005 comm->ndecomp = 0;
9006 comm->nload = 0;
9007 comm->load_step = 0;
9008 comm->load_sum = 0;
9009 comm->load_max = 0;
9010 clear_ivec(comm->load_lim);
9011 comm->load_mdf = 0;
9012 comm->load_pme = 0;
9015 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9017 gmx_domdec_comm_t *comm;
9018 int ddnat;
9019 double av;
9021 comm = cr->dd->comm;
9023 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9025 if (fplog == nullptr)
9027 return;
9030 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9032 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9034 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9035 switch (ddnat)
9037 case ddnatZONE:
9038 fprintf(fplog,
9039 " av. #atoms communicated per step for force: %d x %.1f\n",
9040 2, av);
9041 break;
9042 case ddnatVSITE:
9043 if (cr->dd->vsite_comm)
9045 fprintf(fplog,
9046 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9047 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9048 av);
9050 break;
9051 case ddnatCON:
9052 if (cr->dd->constraint_comm)
9054 fprintf(fplog,
9055 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9056 1 + ir->nLincsIter, av);
9058 break;
9059 default:
9060 gmx_incons(" Unknown type for DD statistics");
9063 fprintf(fplog, "\n");
9065 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9067 print_dd_load_av(fplog, cr->dd);
9071 void dd_partition_system(FILE *fplog,
9072 gmx_int64_t step,
9073 t_commrec *cr,
9074 gmx_bool bMasterState,
9075 int nstglobalcomm,
9076 t_state *state_global,
9077 const gmx_mtop_t *top_global,
9078 const t_inputrec *ir,
9079 t_state *state_local,
9080 PaddedRVecVector *f,
9081 t_mdatoms *mdatoms,
9082 gmx_localtop_t *top_local,
9083 t_forcerec *fr,
9084 gmx_vsite_t *vsite,
9085 gmx_constr_t constr,
9086 t_nrnb *nrnb,
9087 gmx_wallcycle_t wcycle,
9088 gmx_bool bVerbose)
9090 gmx_domdec_t *dd;
9091 gmx_domdec_comm_t *comm;
9092 gmx_ddbox_t ddbox = {0};
9093 t_block *cgs_gl;
9094 gmx_int64_t step_pcoupl;
9095 rvec cell_ns_x0, cell_ns_x1;
9096 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9097 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9098 gmx_bool bRedist, bSortCG, bResortAll;
9099 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9100 real grid_density;
9101 char sbuf[22];
9103 wallcycle_start(wcycle, ewcDOMDEC);
9105 dd = cr->dd;
9106 comm = dd->comm;
9108 bBoxChanged = (bMasterState || inputrecDeform(ir));
9109 if (ir->epc != epcNO)
9111 /* With nstpcouple > 1 pressure coupling happens
9112 * one step after calculating the pressure.
9113 * Box scaling happens at the end of the MD step,
9114 * after the DD partitioning.
9115 * We therefore have to do DLB in the first partitioning
9116 * after an MD step where P-coupling occurred.
9117 * We need to determine the last step in which p-coupling occurred.
9118 * MRS -- need to validate this for vv?
9120 n = ir->nstpcouple;
9121 if (n == 1)
9123 step_pcoupl = step - 1;
9125 else
9127 step_pcoupl = ((step - 1)/n)*n + 1;
9129 if (step_pcoupl >= comm->partition_step)
9131 bBoxChanged = TRUE;
9135 bNStGlobalComm = (step % nstglobalcomm == 0);
9137 if (!dlbIsOn(comm))
9139 bDoDLB = FALSE;
9141 else
9143 /* Should we do dynamic load balancing this step?
9144 * Since it requires (possibly expensive) global communication,
9145 * we might want to do DLB less frequently.
9147 if (bBoxChanged || ir->epc != epcNO)
9149 bDoDLB = bBoxChanged;
9151 else
9153 bDoDLB = bNStGlobalComm;
9157 /* Check if we have recorded loads on the nodes */
9158 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9160 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9162 /* Print load every nstlog, first and last step to the log file */
9163 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9164 comm->n_load_collect == 0 ||
9165 (ir->nsteps >= 0 &&
9166 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9168 /* Avoid extra communication due to verbose screen output
9169 * when nstglobalcomm is set.
9171 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9172 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9174 get_load_distribution(dd, wcycle);
9175 if (DDMASTER(dd))
9177 if (bLogLoad)
9179 dd_print_load(fplog, dd, step-1);
9181 if (bVerbose)
9183 dd_print_load_verbose(dd);
9186 comm->n_load_collect++;
9188 if (dlbIsOn(comm))
9190 if (DDMASTER(dd))
9192 /* Add the measured cycles to the running average */
9193 const float averageFactor = 0.1f;
9194 comm->cyclesPerStepDlbExpAverage =
9195 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9196 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
9198 if (comm->dlbState == edlbsOnCanTurnOff &&
9199 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9201 gmx_bool turnOffDlb;
9202 if (DDMASTER(dd))
9204 /* If the running averaged cycles with DLB are more
9205 * than before we turned on DLB, turn off DLB.
9206 * We will again run and check the cycles without DLB
9207 * and we can then decide whether to turn off DLB forever.
9209 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9210 comm->cyclesPerStepBeforeDLB);
9212 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9213 if (turnOffDlb)
9215 /* To turn off DLB, we need to redistribute the atoms */
9216 dd_collect_state(dd, state_local, state_global);
9217 bMasterState = TRUE;
9218 turn_off_dlb(fplog, cr, step);
9222 else if (bCheckWhetherToTurnDlbOn)
9224 gmx_bool turnOffDlbForever = FALSE;
9225 gmx_bool turnOnDlb = FALSE;
9227 /* Since the timings are node dependent, the master decides */
9228 if (DDMASTER(dd))
9230 /* If we recently turned off DLB, we want to check if
9231 * performance is better without DLB. We want to do this
9232 * ASAP to minimize the chance that the external factors that
9233 * slowed down the DLB step are gone by the time we check, which would make us
9234 * incorrectly conclude that DLB was causing the slowdown.
9235 * So we measure one nstlist block, no running average.
9237 if (comm->haveTurnedOffDlb &&
9238 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9239 comm->cyclesPerStepDlbExpAverage)
9241 /* After turning off DLB we ran nstlist steps in fewer
9242 * cycles than with DLB. This likely means that DLB
9243 * is not beneficial, but this could be due to a one-
9244 * time unlucky fluctuation, so we require two such
9245 * observations in close succession to turn off DLB
9246 * forever.
9248 if (comm->dlbSlowerPartitioningCount > 0 &&
9249 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9251 turnOffDlbForever = TRUE;
9253 comm->haveTurnedOffDlb = false;
9254 /* Register when we last measured DLB slowdown */
9255 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9257 else
9259 /* Here we check if the max PME rank load is more than 0.98
9260 * times the max PP force load. If so, PP DLB will not help,
9261 * since we are (almost) limited by PME. Furthermore,
9262 * DLB will cause a significant extra x/f redistribution
9263 * cost on the PME ranks, which will then surely result
9264 * in lower total performance.
9266 if (cr->npmenodes > 0 &&
9267 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9269 turnOnDlb = FALSE;
9271 else
9273 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9277 struct
9279 gmx_bool turnOffDlbForever;
9280 gmx_bool turnOnDlb;
9282 bools {
9283 turnOffDlbForever, turnOnDlb
9285 dd_bcast(dd, sizeof(bools), &bools);
9286 if (bools.turnOffDlbForever)
9288 turn_off_dlb_forever(fplog, cr, step);
9290 else if (bools.turnOnDlb)
9292 turn_on_dlb(fplog, cr, step);
9293 bDoDLB = TRUE;
9297 comm->n_load_have++;
9300 cgs_gl = &comm->cgs_gl;
9302 bRedist = FALSE;
9303 if (bMasterState)
9305 /* Clear the old state */
9306 clear_dd_indices(dd, 0, 0);
9307 ncgindex_set = 0;
9309 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9310 TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
9312 get_cg_distribution(fplog, dd, cgs_gl,
9313 state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
9315 dd_distribute_state(dd, cgs_gl,
9316 state_global, state_local, f);
9318 dd_make_local_cgs(dd, &top_local->cgs);
9320 /* Ensure that we have space for the new distribution */
9321 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9323 if (fr->cutoff_scheme == ecutsGROUP)
9325 calc_cgcm(fplog, 0, dd->ncg_home,
9326 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9329 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9331 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9333 else if (state_local->ddp_count != dd->ddp_count)
9335 if (state_local->ddp_count > dd->ddp_count)
9337 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9340 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9342 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9345 /* Clear the old state */
9346 clear_dd_indices(dd, 0, 0);
9348 /* Build the new indices */
9349 rebuild_cgindex(dd, cgs_gl->index, state_local);
9350 make_dd_indices(dd, cgs_gl->index, 0);
9351 ncgindex_set = dd->ncg_home;
9353 if (fr->cutoff_scheme == ecutsGROUP)
9355 /* Redetermine the cg COMs */
9356 calc_cgcm(fplog, 0, dd->ncg_home,
9357 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9360 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9362 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9364 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9365 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9367 bRedist = dlbIsOn(comm);
9369 else
9371 /* We have the full state, only redistribute the cgs */
9373 /* Clear the non-home indices */
9374 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9375 ncgindex_set = 0;
9377 /* Avoid global communication for dim's without pbc and -gcom */
9378 if (!bNStGlobalComm)
9380 copy_rvec(comm->box0, ddbox.box0 );
9381 copy_rvec(comm->box_size, ddbox.box_size);
9383 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9384 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9386 bBoxChanged = TRUE;
9387 bRedist = TRUE;
    /* For dims without pbc and -gcom */
    copy_rvec(ddbox.box0,     comm->box0    );
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);
    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    bSortCG = (bMasterState || bRedist);
    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr,
                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }
    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }
    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns->grid->n, ncells_old);
            grid_first(fplog, fr->ns->grid, dd, &ddbox,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlist, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);
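    /* The ncells_old recorded above is compared with the cell counts after
     * regridding (ncells_new, see bResortAll below) to decide whether the
     * previous charge-group order can still be used to speed up sorting.
     */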
    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */
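        /* The exact-restart point holds because, after sorting, the home atom
         * order is a deterministic function of the grid and the coordinates
         * rather than of the communication history, so a run continued from
         * a checkpoint written here can, presumably, reproduce the same order
         * and thus binary-identical results.
         */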
        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  as_rvec_array(state_local->x.data()),
                                  ncg_moved, bRedist ? comm->moved : nullptr,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);
                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns->grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }
        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }
        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        dd_sort_state(dd, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
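        /* Passing -1 apparently requests a full resort, while passing the
         * previous home charge-group count lets dd_sort_state() reuse the
         * still-valid part of the old ordering.
         */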
        /* After sorting and compacting we set the correct size */
        dd_resize_state(state_local, f, dd->nat_home);

        /* Rebuild all the indices */
        ga2la_clear(dd->ga2la);
        ncgindex_set = 0;

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }
    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, ncgindex_set);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
    /*
       write_dd_pdb("dd_home", step, "dump", top_global, cr,
                    -1, as_rvec_array(state_local->x.data()), state_local->box);
     */
    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
    /* Set up the special atom communication */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }
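    /* n runs cumulatively: starting from the zone (home plus communicated)
     * atoms, it is extended first with the extra atoms needed to construct
     * inter-cg virtual sites and then with those needed for inter-cg
     * constraints/settles; comm->nat[] stores the running total for each
     * category for later use.
     */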
    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];

    dd_resize_state(state_local, f, state_local->natoms);
    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }
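    /* nat_f_novirsum presumably sizes the separate force buffer whose
     * contributions are not summed into the virial: all atoms up to the
     * vsite-communication count when inter-cg vsites are present, all
     * communicated atoms when full electrostatics has inter-cg exclusions,
     * and otherwise only the home atoms.
     */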
    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when doing energy
     * minimization. For simple simulations we could avoid some
     * allocation, zeroing and copying, but this is probably not worth
     * the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* Update atom data for mdatoms and several algorithms */
    mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
                              nullptr, mdatoms, vsite, nullptr);
    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME only node */
        gmx_pme_send_parameters(cr,
                                fr->ic,
                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }
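    /* After repartitioning the atoms owned by this PP rank have changed, so
     * the dedicated PME rank needs fresh per-atom charges/LJ parameters; the
     * maxshift values presumably tell PME how far charge groups may be
     * shifted in x/y by the dynamic load balancing, i.e. which coordinate
     * range to expect.
     */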
    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->bPull)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    if (ir->eSwapCoords != eswapNO)
    {
        /* Update the local groups needed for ion swapping */
        dd_make_local_swap_groups(dd, ir->swap);
    }

    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, as_rvec_array(state_local->x.data()), state_local->box);
    }
    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
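    /* This counter is what the "else if (state_local->ddp_count != dd->ddp_count)"
     * branch above compares against: a state arriving with a different count
     * triggers a rebuild of the home charge-group indices.
     */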
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }
    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }

    wallcycle_stop(wcycle, ewcDOMDEC);