Convert gmx_mtop_t to C++
[gromacs.git] / src / gromacs / domdec / domdec.cpp
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
48 #include <cmath>
50 #include <algorithm>
52 #include "gromacs/domdec/domdec_network.h"
53 #include "gromacs/domdec/ga2la.h"
54 #include "gromacs/ewald/pme.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gpu_utils/gpu_utils.h"
61 #include "gromacs/hardware/hw_info.h"
62 #include "gromacs/imd/imd.h"
63 #include "gromacs/listed-forces/manage-threading.h"
64 #include "gromacs/math/functions.h"
65 #include "gromacs/math/vec.h"
66 #include "gromacs/math/vectypes.h"
67 #include "gromacs/mdlib/constr.h"
68 #include "gromacs/mdlib/constraintrange.h"
69 #include "gromacs/mdlib/force.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
72 #include "gromacs/mdlib/lincs.h"
73 #include "gromacs/mdlib/mdatoms.h"
74 #include "gromacs/mdlib/mdrun.h"
75 #include "gromacs/mdlib/mdsetup.h"
76 #include "gromacs/mdlib/nb_verlet.h"
77 #include "gromacs/mdlib/nbnxn_grid.h"
78 #include "gromacs/mdlib/nsgrid.h"
79 #include "gromacs/mdlib/vsite.h"
80 #include "gromacs/mdtypes/commrec.h"
81 #include "gromacs/mdtypes/df_history.h"
82 #include "gromacs/mdtypes/forcerec.h"
83 #include "gromacs/mdtypes/inputrec.h"
84 #include "gromacs/mdtypes/md_enums.h"
85 #include "gromacs/mdtypes/mdatom.h"
86 #include "gromacs/mdtypes/nblist.h"
87 #include "gromacs/mdtypes/state.h"
88 #include "gromacs/pbcutil/ishift.h"
89 #include "gromacs/pbcutil/pbc.h"
90 #include "gromacs/pulling/pull.h"
91 #include "gromacs/pulling/pull_rotation.h"
92 #include "gromacs/swap/swapcoords.h"
93 #include "gromacs/timing/wallcycle.h"
94 #include "gromacs/topology/block.h"
95 #include "gromacs/topology/idef.h"
96 #include "gromacs/topology/ifunc.h"
97 #include "gromacs/topology/mtop_lookup.h"
98 #include "gromacs/topology/mtop_util.h"
99 #include "gromacs/topology/topology.h"
100 #include "gromacs/utility/basedefinitions.h"
101 #include "gromacs/utility/basenetwork.h"
102 #include "gromacs/utility/cstringutil.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/qsort_threadsafe.h"
107 #include "gromacs/utility/real.h"
108 #include "gromacs/utility/smalloc.h"
109 #include "gromacs/utility/stringutil.h"
111 #include "domdec_constraints.h"
112 #include "domdec_internal.h"
113 #include "domdec_vsite.h"
115 #define DDRANK(dd, rank) (rank)
116 #define DDMASTERRANK(dd) (dd->masterrank)
118 struct gmx_domdec_master_t
120 /* The cell boundaries */
121 real **cell_x;
122 /* The global charge group division */
123 int *ncg; /* Number of home charge groups for each node */
 124 int *index; /* Index (size nnodes+1) into cg */
125 int *cg; /* Global charge group index */
126 int *nat; /* Number of home atoms for each node. */
127 int *ibuf; /* Buffer for communication */
128 rvec *vbuf; /* Buffer for state scattering and gathering */
131 #define DD_NLOAD_MAX 9
133 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
135 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
136 #define DD_CGIBS 2
138 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
139 #define DD_FLAG_NRCG 65535
140 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
141 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
143 /* The DD zone order */
144 static const ivec dd_zo[DD_MAXZONE] =
145 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
147 /* The non-bonded zone-pair setup for domain decomposition
148 * The first number is the i-zone, the second number the first j-zone seen by
149 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
150 * As is, this is for 3D decomposition, where there are 4 i-zones.
151 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
152 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
154 static const int
155 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
156 {1, 3, 6},
157 {2, 5, 6},
158 {3, 5, 7}};
160 /* Factors used to avoid problems due to rounding issues */
161 #define DD_CELL_MARGIN 1.0001
162 #define DD_CELL_MARGIN2 1.00005
163 /* Factor to account for pressure scaling during nstlist steps */
164 #define DD_PRES_SCALE_MARGIN 1.02
166 /* Turn on DLB when the load imbalance causes this amount of total loss.
167 * There is a bit of overhead with DLB and it's difficult to achieve
168 * a load imbalance of less than 2% with DLB.
170 #define DD_PERF_LOSS_DLB_ON 0.02
172 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
173 #define DD_PERF_LOSS_WARN 0.05
175 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
177 /* Use separate MPI send and receive commands
178 * when nnodes <= GMX_DD_NNODES_SENDRECV.
179 * This saves memory (and some copying for small nnodes).
180 * For high parallelization scatter and gather calls are used.
182 #define GMX_DD_NNODES_SENDRECV 4
 185 /* We check whether to turn on DLB at the first and every 100 DD partitionings.
186 * With large imbalance DLB will turn on at the first step, so we can
187 * make the interval so large that the MPI overhead of the check is negligible.
189 static const int c_checkTurnDlbOnInterval = 100;
190 /* We need to check if DLB results in worse performance and then turn it off.
 191 * We check this more often than for turning DLB on, because the DLB can scale
192 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
193 * and furthermore, we are already synchronizing often with DLB, so
194 * the overhead of the MPI Bcast is not that high.
196 static const int c_checkTurnDlbOffInterval = 20;
198 /* Forward declaration */
199 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
 /* Older z-major indexing, kept only as a commented-out reference;
  * the x-major dd_index defined below is the one in use:
 203 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 205 static void index2xyz(ivec nc,int ind,ivec xyz)
 207 xyz[XX] = ind % nc[XX];
 208 xyz[YY] = (ind / nc[XX]) % nc[YY];
 209 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
  */
213 /* This order is required to minimize the coordinate communication in PME
214 * which uses decomposition in the x direction.
216 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
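/* Example: with nc = {2, 3, 4}, the cell (1, 2, 3) gets
 * dd_index(nc, i) = ((1*3 + 2)*4) + 3 = 23, and ddindex2xyz(nc, 23, xyz)
 * below recovers xyz = (1, 2, 3).
 */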
218 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
220 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
221 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
222 xyz[ZZ] = ind % nc[ZZ];
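/*! \brief Returns the DD rank id for the domain with cell coordinates \p c,
 * using the precomputed lookup table, MPI_Cart_rank, or the plain DD index,
 * depending on the rank layout. */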
225 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
227 int ddindex;
228 int ddnodeid = -1;
230 ddindex = dd_index(dd->nc, c);
231 if (dd->comm->bCartesianPP_PME)
233 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
235 else if (dd->comm->bCartesianPP)
237 #if GMX_MPI
238 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
239 #endif
241 else
243 ddnodeid = ddindex;
246 return ddnodeid;
249 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
251 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
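/* Returns the global atom number (1-based) for local atom index i;
 * without domain decomposition this is simply i+1. */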
254 int ddglatnr(const gmx_domdec_t *dd, int i)
256 int atnr;
258 if (dd == nullptr)
260 atnr = i + 1;
262 else
264 if (i >= dd->comm->nat[ddnatNR-1])
266 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
268 atnr = dd->gatindex[i] + 1;
271 return atnr;
274 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
276 return &dd->comm->cgs_gl;
279 /*! \brief Returns true if the DLB state indicates that the balancer is on. */
280 static bool isDlbOn(const gmx_domdec_comm_t *comm)
282 return (comm->dlbState == edlbsOnCanTurnOff ||
283 comm->dlbState == edlbsOnUser);
285 /*! \brief Returns true if the DLB state indicates that the balancer is off/disabled.
287 static bool isDlbDisabled(const gmx_domdec_comm_t *comm)
289 return (comm->dlbState == edlbsOffUser ||
290 comm->dlbState == edlbsOffForever);
293 static void vec_rvec_init(vec_rvec_t *v)
295 v->nalloc = 0;
296 v->v = nullptr;
299 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
301 if (n > v->nalloc)
303 v->nalloc = over_alloc_dd(n);
304 srenew(v->v, v->nalloc);
308 void dd_store_state(gmx_domdec_t *dd, t_state *state)
310 int i;
312 if (state->ddp_count != dd->ddp_count)
314 gmx_incons("The MD state does not match the domain decomposition state");
317 state->cg_gl.resize(dd->ncg_home);
318 for (i = 0; i < dd->ncg_home; i++)
320 state->cg_gl[i] = dd->index_gl[i];
323 state->ddp_count_cg_gl = dd->ddp_count;
326 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
328 return &dd->comm->zones;
331 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
332 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
334 gmx_domdec_zones_t *zones;
335 int izone, d, dim;
337 zones = &dd->comm->zones;
339 izone = 0;
340 while (icg >= zones->izone[izone].cg1)
342 izone++;
345 if (izone == 0)
347 *jcg0 = icg;
349 else if (izone < zones->nizone)
351 *jcg0 = zones->izone[izone].jcg0;
353 else
355 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
356 icg, izone, zones->nizone);
359 *jcg1 = zones->izone[izone].jcg1;
361 for (d = 0; d < dd->ndim; d++)
363 dim = dd->dim[d];
364 shift0[dim] = zones->izone[izone].shift0[dim];
365 shift1[dim] = zones->izone[izone].shift1[dim];
366 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
368 /* A conservative approach, this can be optimized */
369 shift0[dim] -= 1;
370 shift1[dim] += 1;
375 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
377 /* We currently set mdatoms entries for all atoms:
378 * local + non-local + communicated for vsite + constraints
381 return dd->comm->nat[ddnatNR - 1];
384 int dd_natoms_vsite(const gmx_domdec_t *dd)
386 return dd->comm->nat[ddnatVSITE];
389 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
391 *at_start = dd->comm->nat[ddnatCON-1];
392 *at_end = dd->comm->nat[ddnatCON];
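/* Communicates the atom coordinates needed by neighboring domains' halo zones,
 * applying periodic shifts (and the screw-PBC rotation of y and z) where required. */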
395 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[], gmx_wallcycle *wcycle)
397 wallcycle_start(wcycle, ewcMOVEX);
399 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
400 int *index, *cgindex;
401 gmx_domdec_comm_t *comm;
402 gmx_domdec_comm_dim_t *cd;
403 gmx_domdec_ind_t *ind;
404 rvec shift = {0, 0, 0}, *buf, *rbuf;
405 gmx_bool bPBC, bScrew;
407 comm = dd->comm;
409 cgindex = dd->cgindex;
411 buf = comm->vbuf.v;
413 nzone = 1;
414 nat_tot = dd->nat_home;
415 for (d = 0; d < dd->ndim; d++)
417 bPBC = (dd->ci[dd->dim[d]] == 0);
418 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
419 if (bPBC)
421 copy_rvec(box[dd->dim[d]], shift);
423 cd = &comm->cd[d];
424 for (p = 0; p < cd->np; p++)
426 ind = &cd->ind[p];
427 index = ind->index;
428 n = 0;
429 if (!bPBC)
431 for (i = 0; i < ind->nsend[nzone]; i++)
433 at0 = cgindex[index[i]];
434 at1 = cgindex[index[i]+1];
435 for (j = at0; j < at1; j++)
437 copy_rvec(x[j], buf[n]);
438 n++;
442 else if (!bScrew)
444 for (i = 0; i < ind->nsend[nzone]; i++)
446 at0 = cgindex[index[i]];
447 at1 = cgindex[index[i]+1];
448 for (j = at0; j < at1; j++)
450 /* We need to shift the coordinates */
451 rvec_add(x[j], shift, buf[n]);
452 n++;
456 else
458 for (i = 0; i < ind->nsend[nzone]; i++)
460 at0 = cgindex[index[i]];
461 at1 = cgindex[index[i]+1];
462 for (j = at0; j < at1; j++)
464 /* Shift x */
465 buf[n][XX] = x[j][XX] + shift[XX];
466 /* Rotate y and z.
467 * This operation requires a special shift force
468 * treatment, which is performed in calc_vir.
470 buf[n][YY] = box[YY][YY] - x[j][YY];
471 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
472 n++;
477 if (cd->bInPlace)
479 rbuf = x + nat_tot;
481 else
483 rbuf = comm->vbuf2.v;
485 /* Send and receive the coordinates */
486 dd_sendrecv_rvec(dd, d, dddirBackward,
487 buf, ind->nsend[nzone+1],
488 rbuf, ind->nrecv[nzone+1]);
489 if (!cd->bInPlace)
491 j = 0;
492 for (zone = 0; zone < nzone; zone++)
494 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
496 copy_rvec(rbuf[j], x[i]);
497 j++;
501 nat_tot += ind->nrecv[nzone+1];
503 nzone += nzone;
506 wallcycle_stop(wcycle, ewcMOVEX);
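/* The reverse of dd_move_x(): returns the forces computed on halo atoms to their
 * home domains and adds them to the local forces, accumulating shift forces
 * where PBC applies. */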
509 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift, gmx_wallcycle *wcycle)
511 wallcycle_start(wcycle, ewcMOVEF);
513 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
514 int *index, *cgindex;
515 gmx_domdec_comm_t *comm;
516 gmx_domdec_comm_dim_t *cd;
517 gmx_domdec_ind_t *ind;
518 rvec *buf, *sbuf;
519 ivec vis;
520 int is;
521 gmx_bool bShiftForcesNeedPbc, bScrew;
523 comm = dd->comm;
525 cgindex = dd->cgindex;
527 buf = comm->vbuf.v;
529 nzone = comm->zones.n/2;
530 nat_tot = dd->nat_tot;
531 for (d = dd->ndim-1; d >= 0; d--)
533 /* Only forces in domains near the PBC boundaries need to
534 consider PBC in the treatment of fshift */
535 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
536 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
537 if (fshift == nullptr && !bScrew)
539 bShiftForcesNeedPbc = FALSE;
541 /* Determine which shift vector we need */
542 clear_ivec(vis);
543 vis[dd->dim[d]] = 1;
544 is = IVEC2IS(vis);
546 cd = &comm->cd[d];
547 for (p = cd->np-1; p >= 0; p--)
549 ind = &cd->ind[p];
550 nat_tot -= ind->nrecv[nzone+1];
551 if (cd->bInPlace)
553 sbuf = f + nat_tot;
555 else
557 sbuf = comm->vbuf2.v;
558 j = 0;
559 for (zone = 0; zone < nzone; zone++)
561 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
563 copy_rvec(f[i], sbuf[j]);
564 j++;
568 /* Communicate the forces */
569 dd_sendrecv_rvec(dd, d, dddirForward,
570 sbuf, ind->nrecv[nzone+1],
571 buf, ind->nsend[nzone+1]);
572 index = ind->index;
573 /* Add the received forces */
574 n = 0;
575 if (!bShiftForcesNeedPbc)
577 for (i = 0; i < ind->nsend[nzone]; i++)
579 at0 = cgindex[index[i]];
580 at1 = cgindex[index[i]+1];
581 for (j = at0; j < at1; j++)
583 rvec_inc(f[j], buf[n]);
584 n++;
588 else if (!bScrew)
590 /* fshift should always be defined if this function is
591 * called when bShiftForcesNeedPbc is true */
592 assert(NULL != fshift);
593 for (i = 0; i < ind->nsend[nzone]; i++)
595 at0 = cgindex[index[i]];
596 at1 = cgindex[index[i]+1];
597 for (j = at0; j < at1; j++)
599 rvec_inc(f[j], buf[n]);
600 /* Add this force to the shift force */
601 rvec_inc(fshift[is], buf[n]);
602 n++;
606 else
608 for (i = 0; i < ind->nsend[nzone]; i++)
610 at0 = cgindex[index[i]];
611 at1 = cgindex[index[i]+1];
612 for (j = at0; j < at1; j++)
614 /* Rotate the force */
615 f[j][XX] += buf[n][XX];
616 f[j][YY] -= buf[n][YY];
617 f[j][ZZ] -= buf[n][ZZ];
618 if (fshift)
620 /* Add this force to the shift force */
621 rvec_inc(fshift[is], buf[n]);
623 n++;
628 nzone /= 2;
630 wallcycle_stop(wcycle, ewcMOVEF);
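/* Spreads a single real value per atom to the halo zones,
 * using the same communication pattern as dd_move_x(). */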
633 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
635 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
636 int *index, *cgindex;
637 gmx_domdec_comm_t *comm;
638 gmx_domdec_comm_dim_t *cd;
639 gmx_domdec_ind_t *ind;
640 real *buf, *rbuf;
642 comm = dd->comm;
644 cgindex = dd->cgindex;
646 buf = &comm->vbuf.v[0][0];
648 nzone = 1;
649 nat_tot = dd->nat_home;
650 for (d = 0; d < dd->ndim; d++)
652 cd = &comm->cd[d];
653 for (p = 0; p < cd->np; p++)
655 ind = &cd->ind[p];
656 index = ind->index;
657 n = 0;
658 for (i = 0; i < ind->nsend[nzone]; i++)
660 at0 = cgindex[index[i]];
661 at1 = cgindex[index[i]+1];
662 for (j = at0; j < at1; j++)
664 buf[n] = v[j];
665 n++;
669 if (cd->bInPlace)
671 rbuf = v + nat_tot;
673 else
675 rbuf = &comm->vbuf2.v[0][0];
 677 /* Send and receive the values */
678 dd_sendrecv_real(dd, d, dddirBackward,
679 buf, ind->nsend[nzone+1],
680 rbuf, ind->nrecv[nzone+1]);
681 if (!cd->bInPlace)
683 j = 0;
684 for (zone = 0; zone < nzone; zone++)
686 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
688 v[i] = rbuf[j];
689 j++;
693 nat_tot += ind->nrecv[nzone+1];
695 nzone += nzone;
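/* Sums a single real value per atom from the halo zones back onto the home atoms,
 * using the same communication pattern as dd_move_f(). */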
699 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
701 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
702 int *index, *cgindex;
703 gmx_domdec_comm_t *comm;
704 gmx_domdec_comm_dim_t *cd;
705 gmx_domdec_ind_t *ind;
706 real *buf, *sbuf;
708 comm = dd->comm;
710 cgindex = dd->cgindex;
712 buf = &comm->vbuf.v[0][0];
714 nzone = comm->zones.n/2;
715 nat_tot = dd->nat_tot;
716 for (d = dd->ndim-1; d >= 0; d--)
718 cd = &comm->cd[d];
719 for (p = cd->np-1; p >= 0; p--)
721 ind = &cd->ind[p];
722 nat_tot -= ind->nrecv[nzone+1];
723 if (cd->bInPlace)
725 sbuf = v + nat_tot;
727 else
729 sbuf = &comm->vbuf2.v[0][0];
730 j = 0;
731 for (zone = 0; zone < nzone; zone++)
733 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
735 sbuf[j] = v[i];
736 j++;
 740 /* Communicate the values to be summed */
741 dd_sendrecv_real(dd, d, dddirForward,
742 sbuf, ind->nrecv[nzone+1],
743 buf, ind->nsend[nzone+1]);
744 index = ind->index;
 745 /* Add the received values */
746 n = 0;
747 for (i = 0; i < ind->nsend[nzone]; i++)
749 at0 = cgindex[index[i]];
750 at1 = cgindex[index[i]+1];
751 for (j = at0; j < at1; j++)
753 v[j] += buf[n];
754 n++;
758 nzone /= 2;
762 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
764 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
765 d, i, j,
766 zone->min0, zone->max1,
 767 zone->mch0, zone->mch1,
768 zone->p1_0, zone->p1_1);
772 #define DDZONECOMM_MAXZONE 5
773 #define DDZONECOMM_BUFSIZE 3
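/*! \brief Sends and receives arrays of gmx_ddzone_t along decomposition
 * dimension \p ddimind by packing them into rvec buffers for dd_sendrecv_rvec(). */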
775 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
776 int ddimind, int direction,
777 gmx_ddzone_t *buf_s, int n_s,
778 gmx_ddzone_t *buf_r, int n_r)
780 #define ZBS DDZONECOMM_BUFSIZE
781 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
782 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
783 int i;
785 for (i = 0; i < n_s; i++)
787 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
788 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
789 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
790 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
791 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
792 vbuf_s[i*ZBS+1][2] = 0;
793 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
794 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
795 vbuf_s[i*ZBS+2][2] = 0;
798 dd_sendrecv_rvec(dd, ddimind, direction,
799 vbuf_s, n_s*ZBS,
800 vbuf_r, n_r*ZBS);
802 for (i = 0; i < n_r; i++)
804 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
805 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
806 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
807 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
808 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
809 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
810 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
813 #undef ZBS
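/*! \brief Communicates the zone boundary extremes between neighboring domains
 * and updates the neighbor-search cell extents cell_ns_x0/cell_ns_x1;
 * used with staggered (dynamically load balanced) grids. */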
816 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
817 rvec cell_ns_x0, rvec cell_ns_x1)
819 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
820 gmx_ddzone_t *zp;
821 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
822 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
823 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
824 rvec extr_s[2], extr_r[2];
825 rvec dh;
826 real dist_d, c = 0, det;
827 gmx_domdec_comm_t *comm;
828 gmx_bool bPBC, bUse;
830 comm = dd->comm;
832 for (d = 1; d < dd->ndim; d++)
834 dim = dd->dim[d];
835 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
836 zp->min0 = cell_ns_x0[dim];
837 zp->max1 = cell_ns_x1[dim];
838 zp->min1 = cell_ns_x1[dim];
839 zp->mch0 = cell_ns_x0[dim];
840 zp->mch1 = cell_ns_x1[dim];
841 zp->p1_0 = cell_ns_x0[dim];
842 zp->p1_1 = cell_ns_x1[dim];
845 for (d = dd->ndim-2; d >= 0; d--)
847 dim = dd->dim[d];
848 bPBC = (dim < ddbox->npbcdim);
850 /* Use an rvec to store two reals */
851 extr_s[d][0] = comm->cell_f0[d+1];
852 extr_s[d][1] = comm->cell_f1[d+1];
853 extr_s[d][2] = comm->cell_f1[d+1];
855 pos = 0;
856 /* Store the extremes in the backward sending buffer,
 857 * so they get updated separately from the forward communication.
859 for (d1 = d; d1 < dd->ndim-1; d1++)
861 /* We invert the order to be able to use the same loop for buf_e */
862 buf_s[pos].min0 = extr_s[d1][1];
863 buf_s[pos].max1 = extr_s[d1][0];
864 buf_s[pos].min1 = extr_s[d1][2];
865 buf_s[pos].mch0 = 0;
866 buf_s[pos].mch1 = 0;
867 /* Store the cell corner of the dimension we communicate along */
868 buf_s[pos].p1_0 = comm->cell_x0[dim];
869 buf_s[pos].p1_1 = 0;
870 pos++;
873 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
874 pos++;
876 if (dd->ndim == 3 && d == 0)
878 buf_s[pos] = comm->zone_d2[0][1];
879 pos++;
880 buf_s[pos] = comm->zone_d1[0];
881 pos++;
884 /* We only need to communicate the extremes
885 * in the forward direction
887 npulse = comm->cd[d].np;
888 if (bPBC)
890 /* Take the minimum to avoid double communication */
891 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
893 else
895 /* Without PBC we should really not communicate over
896 * the boundaries, but implementing that complicates
897 * the communication setup and therefore we simply
898 * do all communication, but ignore some data.
900 npulse_min = npulse;
902 for (p = 0; p < npulse_min; p++)
904 /* Communicate the extremes forward */
905 bUse = (bPBC || dd->ci[dim] > 0);
907 dd_sendrecv_rvec(dd, d, dddirForward,
908 extr_s+d, dd->ndim-d-1,
909 extr_r+d, dd->ndim-d-1);
911 if (bUse)
913 for (d1 = d; d1 < dd->ndim-1; d1++)
915 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
916 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
917 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
922 buf_size = pos;
923 for (p = 0; p < npulse; p++)
925 /* Communicate all the zone information backward */
926 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
928 dd_sendrecv_ddzone(dd, d, dddirBackward,
929 buf_s, buf_size,
930 buf_r, buf_size);
932 clear_rvec(dh);
933 if (p > 0)
935 for (d1 = d+1; d1 < dd->ndim; d1++)
937 /* Determine the decrease of maximum required
 938 * communication height along d1 due to the distance along d;
939 * this avoids a lot of useless atom communication.
941 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
943 if (ddbox->tric_dir[dim])
945 /* c is the off-diagonal coupling between the cell planes
946 * along directions d and d1.
948 c = ddbox->v[dim][dd->dim[d1]][dim];
950 else
952 c = 0;
954 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
955 if (det > 0)
957 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
959 else
961 /* A negative value signals out of range */
962 dh[d1] = -1;
967 /* Accumulate the extremes over all pulses */
968 for (i = 0; i < buf_size; i++)
970 if (p == 0)
972 buf_e[i] = buf_r[i];
974 else
976 if (bUse)
978 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
979 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
980 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
983 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
985 d1 = 1;
987 else
989 d1 = d + 1;
991 if (bUse && dh[d1] >= 0)
993 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
994 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
997 /* Copy the received buffer to the send buffer,
998 * to pass the data through with the next pulse.
1000 buf_s[i] = buf_r[i];
1002 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1003 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1005 /* Store the extremes */
1006 pos = 0;
1008 for (d1 = d; d1 < dd->ndim-1; d1++)
1010 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1011 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1012 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1013 pos++;
1016 if (d == 1 || (d == 0 && dd->ndim == 3))
1018 for (i = d; i < 2; i++)
1020 comm->zone_d2[1-d][i] = buf_e[pos];
1021 pos++;
1024 if (d == 0)
1026 comm->zone_d1[1] = buf_e[pos];
1027 pos++;
1033 if (dd->ndim >= 2)
1035 dim = dd->dim[1];
1036 for (i = 0; i < 2; i++)
1038 if (debug)
1040 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1042 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1043 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1046 if (dd->ndim >= 3)
1048 dim = dd->dim[2];
1049 for (i = 0; i < 2; i++)
1051 for (j = 0; j < 2; j++)
1053 if (debug)
1055 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1057 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1058 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1062 for (d = 1; d < dd->ndim; d++)
1064 comm->cell_f_max0[d] = extr_s[d-1][0];
1065 comm->cell_f_min1[d] = extr_s[d-1][1];
1066 if (debug)
1068 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1069 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
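/*! \brief Collects the charge-group and atom counts plus the global
 * charge-group indices of all ranks on the master, unless the master copy
 * is already up to date. */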
1074 static void dd_collect_cg(gmx_domdec_t *dd,
1075 const t_state *state_local)
1077 gmx_domdec_master_t *ma = nullptr;
1078 int buf2[2], *ibuf, i, ncg_home = 0, nat_home = 0;
1080 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1082 /* The master has the correct distribution */
1083 return;
1086 const int *cg;
1088 if (state_local->ddp_count == dd->ddp_count)
1090 /* The local state and DD are in sync, use the DD indices */
1091 ncg_home = dd->ncg_home;
1092 cg = dd->index_gl;
1093 nat_home = dd->nat_home;
1095 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1097 /* The DD is out of sync with the local state, but we have stored
1098 * the cg indices with the local state, so we can use those.
1100 t_block *cgs_gl;
1102 cgs_gl = &dd->comm->cgs_gl;
1104 ncg_home = state_local->cg_gl.size();
1105 cg = state_local->cg_gl.data();
1106 nat_home = 0;
1107 for (i = 0; i < ncg_home; i++)
1109 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1112 else
1114 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1117 buf2[0] = ncg_home;
1118 buf2[1] = nat_home;
1119 if (DDMASTER(dd))
1121 ma = dd->ma;
1122 ibuf = ma->ibuf;
1124 else
1126 ibuf = nullptr;
1128 /* Collect the charge group and atom counts on the master */
1129 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1131 if (DDMASTER(dd))
1133 ma->index[0] = 0;
1134 for (i = 0; i < dd->nnodes; i++)
1136 ma->ncg[i] = ma->ibuf[2*i];
1137 ma->nat[i] = ma->ibuf[2*i+1];
1138 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1141 /* Make byte counts and indices */
1142 for (i = 0; i < dd->nnodes; i++)
1144 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1145 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1147 if (debug)
1149 fprintf(debug, "Initial charge group distribution: ");
1150 for (i = 0; i < dd->nnodes; i++)
1152 fprintf(debug, " %d", ma->ncg[i]);
1154 fprintf(debug, "\n");
1158 /* Collect the charge group indices on the master */
1159 dd_gatherv(dd,
1160 ncg_home*sizeof(int), cg,
1161 DDMASTER(dd) ? ma->ibuf : nullptr,
1162 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1163 DDMASTER(dd) ? ma->cg : nullptr);
1165 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1168 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1169 gmx::ArrayRef<const gmx::RVec> lv,
1170 gmx::ArrayRef<gmx::RVec> v)
1172 gmx_domdec_master_t *ma;
1173 int n, i, c, a, nalloc = 0;
1174 rvec *buf = nullptr;
1175 t_block *cgs_gl;
1177 ma = dd->ma;
1179 if (!DDMASTER(dd))
1181 #if GMX_MPI
1182 MPI_Send(const_cast<void *>(static_cast<const void *>(lv.data())), dd->nat_home*sizeof(rvec), MPI_BYTE,
1183 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1184 #endif
1186 else
1188 /* Copy the master coordinates to the global array */
1189 cgs_gl = &dd->comm->cgs_gl;
1191 n = DDMASTERRANK(dd);
1192 a = 0;
1193 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1195 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1197 copy_rvec(lv[a++], v[c]);
1201 for (n = 0; n < dd->nnodes; n++)
1203 if (n != dd->rank)
1205 if (ma->nat[n] > nalloc)
1207 nalloc = over_alloc_dd(ma->nat[n]);
1208 srenew(buf, nalloc);
1210 #if GMX_MPI
1211 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1212 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1213 #endif
1214 a = 0;
1215 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1217 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1219 copy_rvec(buf[a++], v[c]);
1224 sfree(buf);
1228 static void get_commbuffer_counts(gmx_domdec_t *dd,
1229 int **counts, int **disps)
1231 gmx_domdec_master_t *ma;
1232 int n;
1234 ma = dd->ma;
 1236 /* Make the rvec count and displacement arrays */
1237 *counts = ma->ibuf;
1238 *disps = ma->ibuf + dd->nnodes;
1239 for (n = 0; n < dd->nnodes; n++)
1241 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1242 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1246 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1247 gmx::ArrayRef<const gmx::RVec> lv,
1248 gmx::ArrayRef<gmx::RVec> v)
1250 gmx_domdec_master_t *ma;
1251 int *rcounts = nullptr, *disps = nullptr;
1252 int n, i, c, a;
1253 rvec *buf = nullptr;
1254 t_block *cgs_gl;
1256 ma = dd->ma;
1258 if (DDMASTER(dd))
1260 get_commbuffer_counts(dd, &rcounts, &disps);
1262 buf = ma->vbuf;
1265 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv.data(), rcounts, disps, buf);
1267 if (DDMASTER(dd))
1269 cgs_gl = &dd->comm->cgs_gl;
1271 a = 0;
1272 for (n = 0; n < dd->nnodes; n++)
1274 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1276 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1278 copy_rvec(buf[a++], v[c]);
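/* Collects a distributed per-atom rvec vector on the master rank; uses
 * point-to-point send/recv for small rank counts and gatherv otherwise. */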
1285 void dd_collect_vec(gmx_domdec_t *dd,
1286 const t_state *state_local,
1287 gmx::ArrayRef<const gmx::RVec> lv,
1288 gmx::ArrayRef<gmx::RVec> v)
1290 dd_collect_cg(dd, state_local);
1292 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1294 dd_collect_vec_sendrecv(dd, lv, v);
1296 else
1298 dd_collect_vec_gatherv(dd, lv, v);
1303 void dd_collect_state(gmx_domdec_t *dd,
1304 const t_state *state_local, t_state *state)
1306 int nh = state_local->nhchainlength;
1308 if (DDMASTER(dd))
1310 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
1312 for (int i = 0; i < efptNR; i++)
1314 state->lambda[i] = state_local->lambda[i];
1316 state->fep_state = state_local->fep_state;
1317 state->veta = state_local->veta;
1318 state->vol0 = state_local->vol0;
1319 copy_mat(state_local->box, state->box);
1320 copy_mat(state_local->boxv, state->boxv);
1321 copy_mat(state_local->svir_prev, state->svir_prev);
1322 copy_mat(state_local->fvir_prev, state->fvir_prev);
1323 copy_mat(state_local->pres_prev, state->pres_prev);
1325 for (int i = 0; i < state_local->ngtc; i++)
1327 for (int j = 0; j < nh; j++)
1329 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1330 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1332 state->therm_integral[i] = state_local->therm_integral[i];
1334 for (int i = 0; i < state_local->nnhpres; i++)
1336 for (int j = 0; j < nh; j++)
1338 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1339 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1342 state->baros_integral = state_local->baros_integral;
1344 if (state_local->flags & (1 << estX))
1346 gmx::ArrayRef<gmx::RVec> globalXRef = state ? gmx::makeArrayRef(state->x) : gmx::EmptyArrayRef();
1347 dd_collect_vec(dd, state_local, state_local->x, globalXRef);
1349 if (state_local->flags & (1 << estV))
1351 gmx::ArrayRef<gmx::RVec> globalVRef = state ? gmx::makeArrayRef(state->v) : gmx::EmptyArrayRef();
1352 dd_collect_vec(dd, state_local, state_local->v, globalVRef);
1354 if (state_local->flags & (1 << estCGP))
1356 gmx::ArrayRef<gmx::RVec> globalCgpRef = state ? gmx::makeArrayRef(state->cg_p) : gmx::EmptyArrayRef();
1357 dd_collect_vec(dd, state_local, state_local->cg_p, globalCgpRef);
1361 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1363 if (debug)
1365 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1368 state_change_natoms(state, natoms);
1370 if (f != nullptr)
1372 /* We need to allocate one element extra, since we might use
1373 * (unaligned) 4-wide SIMD loads to access rvec entries.
1375 f->resize(paddedRVecVectorSize(natoms));
1379 static void dd_check_alloc_ncg(t_forcerec *fr,
1380 t_state *state,
1381 PaddedRVecVector *f,
1382 int numChargeGroups)
1384 if (numChargeGroups > fr->cg_nalloc)
1386 if (debug)
1388 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1390 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1391 srenew(fr->cginfo, fr->cg_nalloc);
1392 if (fr->cutoff_scheme == ecutsGROUP)
1394 srenew(fr->cg_cm, fr->cg_nalloc);
1397 if (fr->cutoff_scheme == ecutsVERLET)
1399 /* We don't use charge groups, we use x in state to set up
1400 * the atom communication.
1402 dd_resize_state(state, f, numChargeGroups);
1406 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1407 const rvec *v, rvec *lv)
1409 gmx_domdec_master_t *ma;
1410 int n, i, c, a, nalloc = 0;
1411 rvec *buf = nullptr;
1413 if (DDMASTER(dd))
1415 ma = dd->ma;
1417 for (n = 0; n < dd->nnodes; n++)
1419 if (n != dd->rank)
1421 if (ma->nat[n] > nalloc)
1423 nalloc = over_alloc_dd(ma->nat[n]);
1424 srenew(buf, nalloc);
1426 /* Use lv as a temporary buffer */
1427 a = 0;
1428 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1430 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1432 copy_rvec(v[c], buf[a++]);
1435 if (a != ma->nat[n])
1437 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1438 a, ma->nat[n]);
1441 #if GMX_MPI
1442 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1443 DDRANK(dd, n), n, dd->mpi_comm_all);
1444 #endif
1447 sfree(buf);
1448 n = DDMASTERRANK(dd);
1449 a = 0;
1450 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1452 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1454 copy_rvec(v[c], lv[a++]);
1458 else
1460 #if GMX_MPI
1461 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1462 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1463 #endif
1467 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1468 const rvec *v, rvec *lv)
1470 gmx_domdec_master_t *ma;
1471 int *scounts = nullptr, *disps = nullptr;
1472 int n, i, c, a;
1473 rvec *buf = nullptr;
1475 if (DDMASTER(dd))
1477 ma = dd->ma;
1479 get_commbuffer_counts(dd, &scounts, &disps);
1481 buf = ma->vbuf;
1482 a = 0;
1483 for (n = 0; n < dd->nnodes; n++)
1485 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1487 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1489 copy_rvec(v[c], buf[a++]);
1495 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1498 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs,
1499 const rvec *v, rvec *lv)
1501 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1503 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1505 else
1507 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1511 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1513 if (dfhist == nullptr)
1515 return;
1518 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1519 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1520 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1522 if (dfhist->nlambda > 0)
1524 int nlam = dfhist->nlambda;
1525 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1526 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1527 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1528 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1529 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1530 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1532 for (int i = 0; i < nlam; i++)
1534 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1535 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1536 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1537 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1538 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1539 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
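/*! \brief Distributes the global state from the master rank to the local states:
 * scalar and ensemble data are broadcast, per-atom vectors are scattered
 * according to the charge-group distribution. */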
1544 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1545 t_state *state, t_state *state_local,
1546 PaddedRVecVector *f)
1548 int nh = state_local->nhchainlength;
1550 if (DDMASTER(dd))
1552 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
1554 for (int i = 0; i < efptNR; i++)
1556 state_local->lambda[i] = state->lambda[i];
1558 state_local->fep_state = state->fep_state;
1559 state_local->veta = state->veta;
1560 state_local->vol0 = state->vol0;
1561 copy_mat(state->box, state_local->box);
1562 copy_mat(state->box_rel, state_local->box_rel);
1563 copy_mat(state->boxv, state_local->boxv);
1564 copy_mat(state->svir_prev, state_local->svir_prev);
1565 copy_mat(state->fvir_prev, state_local->fvir_prev);
1566 if (state->dfhist != nullptr)
1568 copy_df_history(state_local->dfhist, state->dfhist);
1570 for (int i = 0; i < state_local->ngtc; i++)
1572 for (int j = 0; j < nh; j++)
1574 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1575 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1577 state_local->therm_integral[i] = state->therm_integral[i];
1579 for (int i = 0; i < state_local->nnhpres; i++)
1581 for (int j = 0; j < nh; j++)
1583 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1584 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1587 state_local->baros_integral = state->baros_integral;
1589 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1590 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1591 dd_bcast(dd, sizeof(real), &state_local->veta);
1592 dd_bcast(dd, sizeof(real), &state_local->vol0);
1593 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1594 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1595 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1596 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1597 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1598 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1599 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1600 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1601 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1602 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1604 /* communicate df_history -- required for restarting from checkpoint */
1605 dd_distribute_dfhist(dd, state_local->dfhist);
1607 dd_resize_state(state_local, f, dd->nat_home);
1609 if (state_local->flags & (1 << estX))
1611 const rvec *xGlobal = (DDMASTER(dd) ? as_rvec_array(state->x.data()) : nullptr);
1612 dd_distribute_vec(dd, cgs, xGlobal, as_rvec_array(state_local->x.data()));
1614 if (state_local->flags & (1 << estV))
1616 const rvec *vGlobal = (DDMASTER(dd) ? as_rvec_array(state->v.data()) : nullptr);
1617 dd_distribute_vec(dd, cgs, vGlobal, as_rvec_array(state_local->v.data()));
1619 if (state_local->flags & (1 << estCGP))
1621 const rvec *cgpGlobal = (DDMASTER(dd) ? as_rvec_array(state->cg_p.data()) : nullptr);
1622 dd_distribute_vec(dd, cgs, cgpGlobal, as_rvec_array(state_local->cg_p.data()));
1626 static char dim2char(int dim)
1628 char c = '?';
1630 switch (dim)
1632 case XX: c = 'X'; break;
1633 case YY: c = 'Y'; break;
1634 case ZZ: c = 'Z'; break;
1635 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1638 return c;
1641 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1642 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1644 rvec grid_s[2], *grid_r = nullptr, cx, r;
1645 char fname[STRLEN], buf[22];
1646 FILE *out;
1647 int a, i, d, z, y, x;
1648 matrix tric;
1649 real vol;
1651 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1652 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1654 if (DDMASTER(dd))
1656 snew(grid_r, 2*dd->nnodes);
1659 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
1661 if (DDMASTER(dd))
1663 for (d = 0; d < DIM; d++)
1665 for (i = 0; i < DIM; i++)
1667 if (d == i)
1669 tric[d][i] = 1;
1671 else
1673 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1675 tric[d][i] = box[i][d]/box[i][i];
1677 else
1679 tric[d][i] = 0;
1684 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1685 out = gmx_fio_fopen(fname, "w");
1686 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1687 a = 1;
1688 for (i = 0; i < dd->nnodes; i++)
1690 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1691 for (d = 0; d < DIM; d++)
1693 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1695 for (z = 0; z < 2; z++)
1697 for (y = 0; y < 2; y++)
1699 for (x = 0; x < 2; x++)
1701 cx[XX] = grid_r[i*2+x][XX];
1702 cx[YY] = grid_r[i*2+y][YY];
1703 cx[ZZ] = grid_r[i*2+z][ZZ];
1704 mvmul(tric, cx, r);
1705 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1706 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1710 for (d = 0; d < DIM; d++)
1712 for (x = 0; x < 4; x++)
1714 switch (d)
1716 case 0: y = 1 + i*8 + 2*x; break;
1717 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1718 case 2: y = 1 + i*8 + x; break;
1720 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1724 gmx_fio_fclose(out);
1725 sfree(grid_r);
1729 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1730 const gmx_mtop_t *mtop, const t_commrec *cr,
1731 int natoms, rvec x[], matrix box)
1733 char fname[STRLEN], buf[22];
1734 FILE *out;
1735 int i, ii, resnr, c;
1736 const char *atomname, *resname;
1737 real b;
1738 gmx_domdec_t *dd;
1740 dd = cr->dd;
1741 if (natoms == -1)
1743 natoms = dd->comm->nat[ddnatVSITE];
1746 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1748 out = gmx_fio_fopen(fname, "w");
1750 fprintf(out, "TITLE %s\n", title);
1751 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1752 int molb = 0;
1753 for (i = 0; i < natoms; i++)
1755 ii = dd->gatindex[i];
1756 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1757 if (i < dd->comm->nat[ddnatZONE])
1759 c = 0;
1760 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1762 c++;
1764 b = c;
1766 else if (i < dd->comm->nat[ddnatVSITE])
1768 b = dd->comm->zones.n;
1770 else
1772 b = dd->comm->zones.n + 1;
1774 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1775 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1777 fprintf(out, "TER\n");
1779 gmx_fio_fclose(out);
1782 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1784 gmx_domdec_comm_t *comm;
1785 int di;
1786 real r;
1788 comm = dd->comm;
1790 r = -1;
1791 if (comm->bInterCGBondeds)
1793 if (comm->cutoff_mbody > 0)
1795 r = comm->cutoff_mbody;
1797 else
1799 /* cutoff_mbody=0 means we do not have DLB */
1800 r = comm->cellsize_min[dd->dim[0]];
1801 for (di = 1; di < dd->ndim; di++)
1803 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1805 if (comm->bBondComm)
1807 r = std::max(r, comm->cutoff_mbody);
1809 else
1811 r = std::min(r, comm->cutoff);
1816 return r;
1819 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1821 real r_mb;
1823 r_mb = dd_cutoff_multibody(dd);
1825 return std::max(dd->comm->cutoff, r_mb);
1829 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1830 ivec coord_pme)
1832 int nc, ntot;
1834 nc = dd->nc[dd->comm->cartpmedim];
1835 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1836 copy_ivec(coord, coord_pme);
1837 coord_pme[dd->comm->cartpmedim] =
1838 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
1841 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1843 int npp, npme;
1845 npp = dd->nnodes;
1846 npme = dd->comm->npmenodes;
1848 /* Here we assign a PME node to communicate with this DD node
1849 * by assuming that the major index of both is x.
1850 * We add cr->npmenodes/2 to obtain an even distribution.
1852 return (ddindex*npme + npme/2)/npp;
1855 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1857 int *pme_rank;
1858 int n, i, p0, p1;
1860 snew(pme_rank, dd->comm->npmenodes);
1861 n = 0;
1862 for (i = 0; i < dd->nnodes; i++)
1864 p0 = ddindex2pmeindex(dd, i);
1865 p1 = ddindex2pmeindex(dd, i+1);
1866 if (i+1 == dd->nnodes || p1 > p0)
1868 if (debug)
1870 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1872 pme_rank[n] = i + 1 + n;
1873 n++;
1877 return pme_rank;
1880 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
1882 gmx_domdec_t *dd;
1883 ivec coords;
1884 int slab;
1886 dd = cr->dd;
 /* Older Cartesian code path, kept only as a commented-out reference:
 1888 if (dd->comm->bCartesian) {
 1889 gmx_ddindex2xyz(dd->nc,ddindex,coords);
 1890 dd_coords2pmecoords(dd,coords,coords_pme);
 1891 copy_ivec(dd->ntot,nc);
 1892 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 1893 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 1895 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 1896 } else {
 1897 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 }
  */
1900 coords[XX] = x;
1901 coords[YY] = y;
1902 coords[ZZ] = z;
1903 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1905 return slab;
1908 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
1910 gmx_domdec_comm_t *comm;
1911 ivec coords;
1912 int ddindex, nodeid = -1;
1914 comm = cr->dd->comm;
1916 coords[XX] = x;
1917 coords[YY] = y;
1918 coords[ZZ] = z;
1919 if (comm->bCartesianPP_PME)
1921 #if GMX_MPI
1922 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1923 #endif
1925 else
1927 ddindex = dd_index(cr->dd->nc, coords);
1928 if (comm->bCartesianPP)
1930 nodeid = comm->ddindex2simnodeid[ddindex];
1932 else
1934 if (comm->pmenodes)
1936 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1938 else
1940 nodeid = ddindex;
1945 return nodeid;
1948 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1949 const t_commrec gmx_unused *cr,
1950 int sim_nodeid)
1952 int pmenode = -1;
1954 const gmx_domdec_comm_t *comm = dd->comm;
1956 /* This assumes a uniform x domain decomposition grid cell size */
1957 if (comm->bCartesianPP_PME)
1959 #if GMX_MPI
1960 ivec coord, coord_pme;
1961 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1962 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1964 /* This is a PP node */
1965 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1966 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1968 #endif
1970 else if (comm->bCartesianPP)
1972 if (sim_nodeid < dd->nnodes)
1974 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1977 else
1979 /* This assumes DD cells with identical x coordinates
1980 * are numbered sequentially.
1982 if (dd->comm->pmenodes == nullptr)
1984 if (sim_nodeid < dd->nnodes)
1986 /* The DD index equals the nodeid */
1987 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1990 else
1992 int i = 0;
1993 while (sim_nodeid > dd->comm->pmenodes[i])
1995 i++;
1997 if (sim_nodeid < dd->comm->pmenodes[i])
1999 pmenode = dd->comm->pmenodes[i];
2004 return pmenode;
2007 void get_pme_nnodes(const gmx_domdec_t *dd,
2008 int *npmenodes_x, int *npmenodes_y)
2010 if (dd != nullptr)
2012 *npmenodes_x = dd->comm->npmenodes_x;
2013 *npmenodes_y = dd->comm->npmenodes_y;
2015 else
2017 *npmenodes_x = 1;
2018 *npmenodes_y = 1;
2022 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
2024 gmx_domdec_t *dd;
2025 int x, y, z;
2026 ivec coord, coord_pme;
2028 dd = cr->dd;
2030 std::vector<int> ddranks;
2031 ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2033 for (x = 0; x < dd->nc[XX]; x++)
2035 for (y = 0; y < dd->nc[YY]; y++)
2037 for (z = 0; z < dd->nc[ZZ]; z++)
2039 if (dd->comm->bCartesianPP_PME)
2041 coord[XX] = x;
2042 coord[YY] = y;
2043 coord[ZZ] = z;
2044 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2045 if (dd->ci[XX] == coord_pme[XX] &&
2046 dd->ci[YY] == coord_pme[YY] &&
2047 dd->ci[ZZ] == coord_pme[ZZ])
2049 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
2052 else
2054 /* The slab corresponds to the nodeid in the PME group */
2055 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2057 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
2063 return ddranks;
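/*! \brief Returns whether this PP rank should receive the virial and energy
 * contributions from its PME rank; only the last PP rank assigned to a given
 * PME rank receives them. */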
2066 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2068 gmx_bool bReceive = TRUE;
2070 if (cr->npmenodes < dd->nnodes)
2072 gmx_domdec_comm_t *comm = dd->comm;
2073 if (comm->bCartesianPP_PME)
2075 #if GMX_MPI
2076 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2077 ivec coords;
2078 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2079 coords[comm->cartpmedim]++;
2080 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2082 int rank;
2083 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2084 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2086 /* This is not the last PP node for pmenode */
2087 bReceive = FALSE;
2090 #else
2091 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2092 #endif
2094 else
2096 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2097 if (cr->sim_nodeid+1 < cr->nnodes &&
2098 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2100 /* This is not the last PP node for pmenode */
2101 bReceive = FALSE;
2106 return bReceive;
2109 static void set_zones_ncg_home(gmx_domdec_t *dd)
2111 gmx_domdec_zones_t *zones;
2112 int i;
2114 zones = &dd->comm->zones;
2116 zones->cg_range[0] = 0;
2117 for (i = 1; i < zones->n+1; i++)
2119 zones->cg_range[i] = dd->ncg_home;
2121 /* zone_ncg1[0] should always be equal to ncg_home */
2122 dd->comm->zone_ncg1[0] = dd->ncg_home;
2125 static void rebuild_cgindex(gmx_domdec_t *dd,
2126 const int *gcgs_index, const t_state *state)
2128 int * gmx_restrict dd_cg_gl = dd->index_gl;
2129 int * gmx_restrict cgindex = dd->cgindex;
2130 int nat = 0;
2132 /* Copy back the global charge group indices from state
2133 * and rebuild the local charge group to atom index.
2135 cgindex[0] = nat;
2136 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2138 cgindex[i] = nat;
2139 int cg_gl = state->cg_gl[i];
2140 dd_cg_gl[i] = cg_gl;
2141 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2143 cgindex[state->cg_gl.size()] = nat;
2145 dd->ncg_home = state->cg_gl.size();
2146 dd->nat_home = nat;
2148 set_zones_ncg_home(dd);
2151 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2153 while (cg >= cginfo_mb->cg_end)
2155 cginfo_mb++;
2158 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2161 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2162 t_forcerec *fr, char *bLocalCG)
2164 cginfo_mb_t *cginfo_mb;
2165 int *cginfo;
2166 int cg;
2168 if (fr != nullptr)
2170 cginfo_mb = fr->cginfo_mb;
2171 cginfo = fr->cginfo;
2173 for (cg = cg0; cg < cg1; cg++)
2175 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2179 if (bLocalCG != nullptr)
2181 for (cg = cg0; cg < cg1; cg++)
2183 bLocalCG[index_gl[cg]] = TRUE;
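/*! \brief Builds the local-to-global atom index (gatindex) and the
 * global-to-local lookup (ga2la) for all zones, starting from charge group
 * \p cg_start. */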
2188 static void make_dd_indices(gmx_domdec_t *dd,
2189 const int *gcgs_index, int cg_start)
2191 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2192 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2193 gmx_bool bCGs;
2195 if (dd->nat_tot > dd->gatindex_nalloc)
2197 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2198 srenew(dd->gatindex, dd->gatindex_nalloc);
2201 nzone = dd->comm->zones.n;
2202 zone2cg = dd->comm->zones.cg_range;
2203 zone_ncg1 = dd->comm->zone_ncg1;
2204 index_gl = dd->index_gl;
2205 gatindex = dd->gatindex;
2206 bCGs = dd->comm->bCGs;
2208 if (zone2cg[1] != dd->ncg_home)
2210 gmx_incons("dd->ncg_zone is not up to date");
2213 /* Make the local to global and global to local atom index */
2214 a = dd->cgindex[cg_start];
2215 for (zone = 0; zone < nzone; zone++)
2217 if (zone == 0)
2219 cg0 = cg_start;
2221 else
2223 cg0 = zone2cg[zone];
2225 cg1 = zone2cg[zone+1];
2226 cg1_p1 = cg0 + zone_ncg1[zone];
2228 for (cg = cg0; cg < cg1; cg++)
2230 zone1 = zone;
2231 if (cg >= cg1_p1)
2233 /* Signal that this cg is from more than one pulse away */
2234 zone1 += nzone;
2236 cg_gl = index_gl[cg];
2237 if (bCGs)
2239 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2241 gatindex[a] = a_gl;
2242 ga2la_set(dd->ga2la, a_gl, a, zone1);
2243 a++;
2246 else
2248 gatindex[a] = cg_gl;
2249 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2250 a++;
2256 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2257 const char *where)
2259 int i, ngl, nerr;
2261 nerr = 0;
2262 if (bLocalCG == nullptr)
2264 return nerr;
2266 for (i = 0; i < dd->ncg_tot; i++)
2268 if (!bLocalCG[dd->index_gl[i]])
2270 fprintf(stderr,
2271 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2272 nerr++;
2275 ngl = 0;
2276 for (i = 0; i < ncg_sys; i++)
2278 if (bLocalCG[i])
2280 ngl++;
2283 if (ngl != dd->ncg_tot)
2285 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2286 nerr++;
2289 return nerr;
2292 static void check_index_consistency(gmx_domdec_t *dd,
2293 int natoms_sys, int ncg_sys,
2294 const char *where)
2296 int nerr, ngl, i, a, cell;
2297 int *have;
2299 nerr = 0;
2301 if (dd->comm->DD_debug > 1)
2303 snew(have, natoms_sys);
2304 for (a = 0; a < dd->nat_tot; a++)
2306 if (have[dd->gatindex[a]] > 0)
2308 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2310 else
2312 have[dd->gatindex[a]] = a + 1;
2315 sfree(have);
2318 snew(have, dd->nat_tot);
2320 ngl = 0;
2321 for (i = 0; i < natoms_sys; i++)
2323 if (ga2la_get(dd->ga2la, i, &a, &cell))
2325 if (a >= dd->nat_tot)
2327 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2328 nerr++;
2330 else
2332 have[a] = 1;
2333 if (dd->gatindex[a] != i)
2335 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2336 nerr++;
2339 ngl++;
2342 if (ngl != dd->nat_tot)
2344 fprintf(stderr,
2345 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2346 dd->rank, where, ngl, dd->nat_tot);
2348 for (a = 0; a < dd->nat_tot; a++)
2350 if (have[a] == 0)
2352 fprintf(stderr,
2353 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2354 dd->rank, where, a+1, dd->gatindex[a]+1);
2357 sfree(have);
2359 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2361 if (nerr > 0)
2363 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2364 dd->rank, where, nerr);
2368 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2370 int i;
2371 char *bLocalCG;
2373 if (a_start == 0)
2375 /* Clear the whole list without searching */
2376 ga2la_clear(dd->ga2la);
2378 else
2380 for (i = a_start; i < dd->nat_tot; i++)
2382 ga2la_del(dd->ga2la, dd->gatindex[i]);
2386 bLocalCG = dd->comm->bLocalCG;
2387 if (bLocalCG)
2389 for (i = cg_start; i < dd->ncg_tot; i++)
2391 bLocalCG[dd->index_gl[i]] = FALSE;
2395 dd_clear_local_vsite_indices(dd);
2397 if (dd->constraints)
2399 dd_clear_local_constraint_indices(dd);
 2403 /* This function should be used for moving the domain boundaries during DLB,
2404 * for obtaining the minimum cell size. It checks the initially set limit
2405 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2406 * and, possibly, a longer cut-off limit set for PME load balancing.
2408 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2410 real cellsize_min;
2412 cellsize_min = comm->cellsize_min[dim];
2414 if (!comm->bVacDLBNoLimit)
2416 /* The cut-off might have changed, e.g. by PME load balancing,
2417 * from the value used to set comm->cellsize_min, so check it.
2419 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2421 if (comm->bPMELoadBalDLBLimits)
2423 /* Check for the cut-off limit set by the PME load balancing */
2424 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2428 return cellsize_min;
2431 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2432 int dim_ind)
2434 real grid_jump_limit;
2436 /* The distance between the boundaries of cells at distance
2437 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2438 * and by the fact that cells should not be shifted by more than
2439 * half their size, such that cg's only shift by one cell
2440 * at redecomposition.
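 * In practice (see the code below) the limit evaluates to
 * max(cellsize_limit, cutoff/np), with np the number of communication
 * pulses along this dimension; the PME load balancing cut-off is used
 * when it is larger, and without a cut-off restriction
 * (bVacDLBNoLimit) only cellsize_limit applies.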
2442 grid_jump_limit = comm->cellsize_limit;
2443 if (!comm->bVacDLBNoLimit)
2445 if (comm->bPMELoadBalDLBLimits)
2447 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2449 grid_jump_limit = std::max(grid_jump_limit,
2450 cutoff/comm->cd[dim_ind].np);
2453 return grid_jump_limit;
2456 static gmx_bool check_grid_jump(gmx_int64_t step,
2457 gmx_domdec_t *dd,
2458 real cutoff,
2459 gmx_ddbox_t *ddbox,
2460 gmx_bool bFatal)
2462 gmx_domdec_comm_t *comm;
2463 int d, dim;
2464 real limit, bfac;
2465 gmx_bool bInvalid;
2467 bInvalid = FALSE;
2469 comm = dd->comm;
2471 for (d = 1; d < dd->ndim; d++)
2473 dim = dd->dim[d];
2474 limit = grid_jump_limit(comm, cutoff, d);
2475 bfac = ddbox->box_size[dim];
2476 if (ddbox->tric_dir[dim])
2478 bfac *= ddbox->skew_fac[dim];
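/* Presumed reading of the check below: the decomposition is flagged as
 * invalid when the margin between our upper boundary and the largest
 * lower boundary of the staggered neighbor cells, or between the
 * smallest neighbor upper boundary and our lower boundary, drops below
 * the jump limit (bfac converts boundary fractions to distances).
 */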
2480 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2481 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2483 bInvalid = TRUE;
2485 if (bFatal)
2487 char buf[22];
2489 /* This error should never be triggered under normal
2490 * circumstances, but you never know ...
2492 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2493 gmx_step_str(step, buf),
2494 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2499 return bInvalid;
2502 static int dd_load_count(gmx_domdec_comm_t *comm)
2504 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2507 static float dd_force_load(gmx_domdec_comm_t *comm)
2509 float load;
2511 if (comm->eFlop)
2513 load = comm->flop;
2514 if (comm->eFlop > 1)
2516 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
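/* Descriptive note: with eFlop > 1 the flop count above is perturbed by
 * up to +/-5% per level above 1, presumably to exercise the load
 * balancer with artificial imbalance.
 */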
2519 else
2521 load = comm->cycl[ddCyclF];
2522 if (comm->cycl_n[ddCyclF] > 1)
2524 /* Subtract the maximum of the last n cycle counts
2525 * to get rid of possible high counts due to other sources,
2526 * for instance system activity, that would otherwise
2527 * affect the dynamic load balancing.
2529 load -= comm->cycl_max[ddCyclF];
2532 #if GMX_MPI
2533 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2535 float gpu_wait, gpu_wait_sum;
2537 gpu_wait = comm->cycl[ddCyclWaitGPU];
2538 if (comm->cycl_n[ddCyclF] > 1)
2540 /* We should remove the WaitGPU time of the same MD step
2541 * as the one with the maximum F time, since the F time
2542 * and the wait time are not independent.
2543 * Furthermore, the step for the max F time should be chosen
2544 * the same on all ranks that share the same GPU.
2545 * But to keep the code simple, we remove the average instead.
2546 * The main reason for artificially long times at some steps
2547 * is spurious CPU activity or MPI time, so we don't expect
2548 * that changes in the GPU wait time matter a lot here.
2550 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2552 /* Sum the wait times over the ranks that share the same GPU */
2553 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2554 comm->mpi_comm_gpu_shared);
2555 /* Replace the wait time by the average over the ranks */
2556 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2558 #endif
2561 return load;
2564 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2566 gmx_domdec_comm_t *comm;
2567 int i;
2569 comm = dd->comm;
2571 snew(*dim_f, dd->nc[dim]+1);
2572 (*dim_f)[0] = 0;
2573 for (i = 1; i < dd->nc[dim]; i++)
2575 if (comm->slb_frac[dim])
2577 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2579 else
2581 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2584 (*dim_f)[dd->nc[dim]] = 1;
2587 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2589 int pmeindex, slab, nso, i;
2590 ivec xyz;
2592 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2594 ddpme->dim = YY;
2596 else
2598 ddpme->dim = dimind;
2600 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2602 ddpme->nslab = (ddpme->dim == 0 ?
2603 dd->comm->npmenodes_x :
2604 dd->comm->npmenodes_y);
2606 if (ddpme->nslab <= 1)
2608 return;
2611 nso = dd->comm->npmenodes/ddpme->nslab;
2612 /* Determine for each PME slab the PP location range for dimension dim */
2613 snew(ddpme->pp_min, ddpme->nslab);
2614 snew(ddpme->pp_max, ddpme->nslab);
2615 for (slab = 0; slab < ddpme->nslab; slab++)
2617 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2618 ddpme->pp_max[slab] = 0;
2620 for (i = 0; i < dd->nnodes; i++)
2622 ddindex2xyz(dd->nc, i, xyz);
2623 /* For y only use our y/z slab.
2624 * This assumes that the PME x grid size matches the DD grid size.
2626 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2628 pmeindex = ddindex2pmeindex(dd, i);
2629 if (dimind == 0)
2631 slab = pmeindex/nso;
2633 else
2635 slab = pmeindex % ddpme->nslab;
2637 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2638 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2642 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2645 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2647 if (dd->comm->ddpme[0].dim == XX)
2649 return dd->comm->ddpme[0].maxshift;
2651 else
2653 return 0;
2657 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2659 if (dd->comm->ddpme[0].dim == YY)
2661 return dd->comm->ddpme[0].maxshift;
2663 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2665 return dd->comm->ddpme[1].maxshift;
2667 else
2669 return 0;
2673 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2674 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2675 const real *cell_f)
2677 gmx_domdec_comm_t *comm;
2678 int nc, ns, s;
2679 int *xmin, *xmax;
2680 real range, pme_boundary;
2681 int sh;
2683 comm = dd->comm;
2684 nc = dd->nc[ddpme->dim];
2685 ns = ddpme->nslab;
2687 if (!ddpme->dim_match)
2689 /* PP decomposition is not along dim: the worst situation */
2690 sh = ns/2;
2692 else if (ns <= 3 || (bUniform && ns == nc))
2694 /* The optimal situation */
2695 sh = 1;
2697 else
2699 /* We need to check, for all PME nodes, which nodes they
2700 * could possibly need to communicate with.
2702 xmin = ddpme->pp_min;
2703 xmax = ddpme->pp_max;
2704 /* Allow for atoms to be maximally 2/3 times the cut-off
2705 * out of their DD cell. This is a reasonable balance
2706 * between performance and support for most charge-group/cut-off
2707 * combinations.
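 * As an illustrative example: a cut-off of 1.0 nm in a 6 nm box gives
 * range = 2/3 * 1.0 / 6 ~= 0.11 in box fractions, slightly reduced by
 * the 0.999 factor applied below.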
2709 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2710 /* Avoid extra communication when we are exactly at a boundary */
2711 range *= 0.999;
2713 sh = 1;
2714 for (s = 0; s < ns; s++)
2716 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2717 pme_boundary = (real)s/ns;
2718 while (sh+1 < ns &&
2719 ((s-(sh+1) >= 0 &&
2720 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2721 (s-(sh+1) < 0 &&
2722 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2724 sh++;
2726 pme_boundary = (real)(s+1)/ns;
2727 while (sh+1 < ns &&
2728 ((s+(sh+1) < ns &&
2729 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2730 (s+(sh+1) >= ns &&
2731 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2733 sh++;
2738 ddpme->maxshift = sh;
2740 if (debug)
2742 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2743 ddpme->dim, ddpme->maxshift);
2747 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2749 int d, dim;
2751 for (d = 0; d < dd->ndim; d++)
2753 dim = dd->dim[d];
2754 if (dim < ddbox->nboundeddim &&
2755 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2756 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2758 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2759 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2760 dd->nc[dim], dd->comm->cellsize_limit);
2765 enum {
2766 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2769 /* Set the domain boundaries. Use for static (or no) load balancing,
2770 * and also for the starting state for dynamic load balancing.
2771 * setmode determines if and where the boundaries are stored, use the enum above.
2772 * Returns the number of communication pulses in npulse.
2774 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2775 int setmode, ivec npulse)
2777 gmx_domdec_comm_t *comm;
2778 int d, j;
2779 rvec cellsize_min;
2780 real *cell_x, cell_dx, cellsize;
2782 comm = dd->comm;
2784 for (d = 0; d < DIM; d++)
2786 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2787 npulse[d] = 1;
2788 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2790 /* Uniform grid */
2791 cell_dx = ddbox->box_size[d]/dd->nc[d];
2792 switch (setmode)
2794 case setcellsizeslbMASTER:
2795 for (j = 0; j < dd->nc[d]+1; j++)
2797 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2799 break;
2800 case setcellsizeslbLOCAL:
2801 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2802 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2803 break;
2804 default:
2805 break;
2807 cellsize = cell_dx*ddbox->skew_fac[d];
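/* The while loop below effectively computes npulse[d] = ceil(cutoff/cellsize):
 * the smallest number of communication pulses whose combined width covers
 * the cut-off.
 */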
2808 while (cellsize*npulse[d] < comm->cutoff)
2810 npulse[d]++;
2812 cellsize_min[d] = cellsize;
2814 else
2816 /* Statically load balanced grid */
2817 /* Also when we are not doing a master distribution we determine
2818 * all cell borders in a loop to obtain identical values
2819 * to the master distribution case and to determine npulse.
2821 if (setmode == setcellsizeslbMASTER)
2823 cell_x = dd->ma->cell_x[d];
2825 else
2827 snew(cell_x, dd->nc[d]+1);
2829 cell_x[0] = ddbox->box0[d];
2830 for (j = 0; j < dd->nc[d]; j++)
2832 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2833 cell_x[j+1] = cell_x[j] + cell_dx;
2834 cellsize = cell_dx*ddbox->skew_fac[d];
2835 while (cellsize*npulse[d] < comm->cutoff &&
2836 npulse[d] < dd->nc[d]-1)
2838 npulse[d]++;
2840 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2842 if (setmode == setcellsizeslbLOCAL)
2844 comm->cell_x0[d] = cell_x[dd->ci[d]];
2845 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2847 if (setmode != setcellsizeslbMASTER)
2849 sfree(cell_x);
2852 /* The following limitation is to avoid a cell receiving
2853 * some of its own home charge groups back over the periodic boundary.
2854 * Duplicate charge groups cause trouble with the global indices.
2856 if (d < ddbox->npbcdim &&
2857 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2859 char error_string[STRLEN];
2861 sprintf(error_string,
2862 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2863 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2864 comm->cutoff,
2865 dd->nc[d], dd->nc[d],
2866 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2868 if (setmode == setcellsizeslbLOCAL)
2870 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2871 error_string);
2873 else
2875 gmx_fatal(FARGS, error_string);
2880 if (!isDlbOn(comm))
2882 copy_rvec(cellsize_min, comm->cellsize_min);
2885 for (d = 0; d < comm->npmedecompdim; d++)
2887 set_pme_maxshift(dd, &comm->ddpme[d],
2888 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2889 comm->ddpme[d].slb_dim_f);
2894 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2895 int d, int dim, domdec_root_t *root,
2896 const gmx_ddbox_t *ddbox,
2897 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2899 gmx_domdec_comm_t *comm;
2900 int ncd, i, j, nmin, nmin_old;
2901 gmx_bool bLimLo, bLimHi;
2902 real *cell_size;
2903 real fac, halfway, cellsize_limit_f_i, region_size;
2904 gmx_bool bPBC, bLastHi = FALSE;
2905 int nrange[] = {range[0], range[1]};
2907 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2909 comm = dd->comm;
2911 ncd = dd->nc[dim];
2913 bPBC = (dim < ddbox->npbcdim);
2915 cell_size = root->buf_ncd;
2917 if (debug)
2919 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2922 /* First we need to check if the scaling does not make cells
2923 * smaller than the smallest allowed size.
2924 * We need to do this iteratively, since if a cell is too small,
2925 * it needs to be enlarged, which makes all the other cells smaller,
2926 * which could in turn make another cell smaller than allowed.
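 * Each iteration rescales the cells that are not yet clamped to the
 * minimum so that the row still fills region_size:
 * fac = (region_size - nmin*cellsize_limit_f) / (sum of free cell sizes).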
2928 for (i = range[0]; i < range[1]; i++)
2930 root->bCellMin[i] = FALSE;
2932 nmin = 0;
2935 nmin_old = nmin;
2936 /* We need the total for normalization */
2937 fac = 0;
2938 for (i = range[0]; i < range[1]; i++)
2940 if (root->bCellMin[i] == FALSE)
2942 fac += cell_size[i];
2945 fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2946 /* Determine the cell boundaries */
2947 for (i = range[0]; i < range[1]; i++)
2949 if (root->bCellMin[i] == FALSE)
2951 cell_size[i] *= fac;
2952 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2954 cellsize_limit_f_i = 0;
2956 else
2958 cellsize_limit_f_i = cellsize_limit_f;
2960 if (cell_size[i] < cellsize_limit_f_i)
2962 root->bCellMin[i] = TRUE;
2963 cell_size[i] = cellsize_limit_f_i;
2964 nmin++;
2967 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2970 while (nmin > nmin_old);
2972 i = range[1]-1;
2973 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2974 /* For this check we should not use DD_CELL_MARGIN,
2975 * but a slightly smaller factor,
2976 * since rounding could get us below the limit.
2978 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2980 char buf[22];
2981 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2982 gmx_step_str(step, buf),
2983 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2984 ncd, comm->cellsize_min[dim]);
2987 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2989 if (!bUniform)
2991 /* Check if the boundary did not displace more than halfway
2992 * each of the cells it bounds, as this could cause problems,
2993 * especially when the differences between cell sizes are large.
2994 * If changes are applied, they will not make cells smaller
2995 * than the cut-off, as we check all the boundaries which
2996 * might be affected by a change and if the old state was ok,
2997 * the cells will at most be shrunk back to their old size.
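 * Illustrative example: with old boundaries 0.40 and 0.50 around cell i-1,
 * halfway = 0.45, so a proposed cell_f[i] of 0.42 is raised back to 0.45;
 * the analogous check below clamps the boundary from above using the cell
 * on the other side.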
2999 for (i = range[0]+1; i < range[1]; i++)
3001 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3002 if (root->cell_f[i] < halfway)
3004 root->cell_f[i] = halfway;
3005 /* Check if the change also causes shifts of the next boundaries */
3006 for (j = i+1; j < range[1]; j++)
3008 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3010 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3014 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3015 if (root->cell_f[i] > halfway)
3017 root->cell_f[i] = halfway;
3018 /* Check if the change also causes shifts of the next boundaries */
3019 for (j = i-1; j >= range[0]+1; j--)
3021 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3023 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3030 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3031 /* Find the highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following one) (b),
3032 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta); oldb and nexta can be the boundaries.
3033 * nrange is used to pass a and b. */
3034 if (d > 0)
3036 /* Take care of the staggering of the cell boundaries */
3037 if (bUniform)
3039 for (i = range[0]; i < range[1]; i++)
3041 root->cell_f_max0[i] = root->cell_f[i];
3042 root->cell_f_min1[i] = root->cell_f[i+1];
3045 else
3047 for (i = range[0]+1; i < range[1]; i++)
3049 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3050 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3051 if (bLimLo && bLimHi)
3053 /* Both limits violated, try the best we can */
3054 /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3055 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3056 nrange[0] = range[0];
3057 nrange[1] = i;
3058 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3060 nrange[0] = i;
3061 nrange[1] = range[1];
3062 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3064 return;
3066 else if (bLimLo)
3068 /* root->cell_f[i] = root->bound_min[i]; */
3069 nrange[1] = i; /* only store the violation location; there could be a following LimLo violation with a higher index */
3070 bLastHi = FALSE;
3072 else if (bLimHi && !bLastHi)
3074 bLastHi = TRUE;
3075 if (nrange[1] < range[1]) /* found a LimLo before */
3077 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3078 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3079 nrange[0] = nrange[1];
3081 root->cell_f[i] = root->bound_max[i];
3082 nrange[1] = i;
3083 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3084 nrange[0] = i;
3085 nrange[1] = range[1];
3088 if (nrange[1] < range[1]) /* found a LimLo last */
3090 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3091 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3092 nrange[0] = nrange[1];
3093 nrange[1] = range[1];
3094 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3096 else if (nrange[0] > range[0]) /* found at least one LimHi */
3098 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3105 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3106 int d, int dim, domdec_root_t *root,
3107 const gmx_ddbox_t *ddbox,
3108 gmx_bool bDynamicBox,
3109 gmx_bool bUniform, gmx_int64_t step)
3111 gmx_domdec_comm_t *comm;
3112 int ncd, d1, i, pos;
3113 real *cell_size;
3114 real load_aver, load_i, imbalance, change, change_max, sc;
3115 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3116 real change_limit;
3117 real relax = 0.5;
3118 gmx_bool bPBC;
3119 int range[] = { 0, 0 };
3121 comm = dd->comm;
3123 /* Convert the maximum change from the input percentage to a fraction */
3124 change_limit = comm->dlb_scale_lim*0.01;
3126 ncd = dd->nc[dim];
3128 bPBC = (dim < ddbox->npbcdim);
3130 cell_size = root->buf_ncd;
3132 /* Store the original boundaries */
3133 for (i = 0; i < ncd+1; i++)
3135 root->old_cell_f[i] = root->cell_f[i];
3137 if (bUniform)
3139 for (i = 0; i < ncd; i++)
3141 cell_size[i] = 1.0/ncd;
3144 else if (dd_load_count(comm) > 0)
3146 load_aver = comm->load[d].sum_m/ncd;
3147 change_max = 0;
3148 for (i = 0; i < ncd; i++)
3150 /* Determine the relative imbalance of cell i */
3151 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3152 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3153 /* Determine the change of the cell size using underrelaxation */
3154 change = -relax*imbalance;
3155 change_max = std::max(change_max, std::max(change, -change));
3157 /* Limit the amount of scaling.
3158 * We need to use the same rescaling for all cells in one row,
3159 * otherwise the load balancing might not converge.
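 * Concretely (see below): sc = relax, reduced by change_limit/change_max
 * when the largest requested change exceeds the limit, so that no cell
 * changes its size by more than dlb_scale_lim percent in one step.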
3161 sc = relax;
3162 if (change_max > change_limit)
3164 sc *= change_limit/change_max;
3166 for (i = 0; i < ncd; i++)
3168 /* Determine the relative imbalance of cell i */
3169 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3170 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3171 /* Determine the change of the cell size using underrelaxation */
3172 change = -sc*imbalance;
3173 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3177 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3178 cellsize_limit_f *= DD_CELL_MARGIN;
3179 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3180 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3181 if (ddbox->tric_dir[dim])
3183 cellsize_limit_f /= ddbox->skew_fac[dim];
3184 dist_min_f /= ddbox->skew_fac[dim];
3186 if (bDynamicBox && d > 0)
3188 dist_min_f *= DD_PRES_SCALE_MARGIN;
3190 if (d > 0 && !bUniform)
3192 /* Make sure that the grid is not shifted too much */
3193 for (i = 1; i < ncd; i++)
3195 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3197 gmx_incons("Inconsistent DD boundary staggering limits!");
3199 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3200 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3201 if (space > 0)
3203 root->bound_min[i] += 0.5*space;
3205 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3206 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3207 if (space < 0)
3209 root->bound_max[i] += 0.5*space;
3211 if (debug)
3213 fprintf(debug,
3214 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3215 d, i,
3216 root->cell_f_max0[i-1] + dist_min_f,
3217 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3218 root->cell_f_min1[i] - dist_min_f);
3222 range[1] = ncd;
3223 root->cell_f[0] = 0;
3224 root->cell_f[ncd] = 1;
3225 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3228 /* After the checks above, the cells should obey the cut-off
3229 * restrictions, but it does not hurt to check.
3231 for (i = 0; i < ncd; i++)
3233 if (debug)
3235 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3236 dim, i, root->cell_f[i], root->cell_f[i+1]);
3239 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3240 root->cell_f[i+1] - root->cell_f[i] <
3241 cellsize_limit_f/DD_CELL_MARGIN)
3243 char buf[22];
3244 fprintf(stderr,
3245 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3246 gmx_step_str(step, buf), dim2char(dim), i,
3247 (root->cell_f[i+1] - root->cell_f[i])
3248 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3252 pos = ncd + 1;
3253 /* Store the cell boundaries of the lower dimensions at the end */
3254 for (d1 = 0; d1 < d; d1++)
3256 root->cell_f[pos++] = comm->cell_f0[d1];
3257 root->cell_f[pos++] = comm->cell_f1[d1];
3260 if (d < comm->npmedecompdim)
3262 /* The master determines the maximum shift for
3263 * the coordinate communication between separate PME nodes.
3265 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3267 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3268 if (d >= 1)
3270 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3274 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3275 const gmx_ddbox_t *ddbox,
3276 int dimind)
3278 gmx_domdec_comm_t *comm;
3279 int dim;
3281 comm = dd->comm;
3283 /* Set the cell dimensions */
3284 dim = dd->dim[dimind];
3285 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3286 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3287 if (dim >= ddbox->nboundeddim)
3289 comm->cell_x0[dim] += ddbox->box0[dim];
3290 comm->cell_x1[dim] += ddbox->box0[dim];
3294 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3295 int d, int dim, real *cell_f_row,
3296 const gmx_ddbox_t *ddbox)
3298 gmx_domdec_comm_t *comm;
3299 int d1, pos;
3301 comm = dd->comm;
3303 #if GMX_MPI
3304 /* Each node would only need to know two fractions,
3305 * but it is probably cheaper to broadcast the whole array.
3307 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3308 0, comm->mpi_comm_load[d]);
3309 #endif
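/* Layout of cell_f_row, as filled by set_dd_cell_sizes_dlb_root:
 * nc[dim]+1 boundary fractions for this dimension, then cell_f0/cell_f1
 * pairs for each lower decomposition dimension, followed by the PME
 * maxshift value(s) stored as reals.
 */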
3310 /* Copy the fractions for this dimension from the buffer */
3311 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3312 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3313 /* The whole array was communicated, so set the buffer position */
3314 pos = dd->nc[dim] + 1;
3315 for (d1 = 0; d1 <= d; d1++)
3317 if (d1 < d)
3319 /* Copy the cell fractions of the lower dimensions */
3320 comm->cell_f0[d1] = cell_f_row[pos++];
3321 comm->cell_f1[d1] = cell_f_row[pos++];
3323 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3325 /* Convert the communicated shift from float to int */
3326 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3327 if (d >= 1)
3329 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3333 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3334 const gmx_ddbox_t *ddbox,
3335 gmx_bool bDynamicBox,
3336 gmx_bool bUniform, gmx_int64_t step)
3338 gmx_domdec_comm_t *comm;
3339 int d, dim, d1;
3340 gmx_bool bRowMember, bRowRoot;
3341 real *cell_f_row;
3343 comm = dd->comm;
3345 for (d = 0; d < dd->ndim; d++)
3347 dim = dd->dim[d];
3348 bRowMember = TRUE;
3349 bRowRoot = TRUE;
3350 for (d1 = d; d1 < dd->ndim; d1++)
3352 if (dd->ci[dd->dim[d1]] > 0)
3354 if (d1 != d)
3356 bRowMember = FALSE;
3358 bRowRoot = FALSE;
3361 if (bRowMember)
3363 if (bRowRoot)
3365 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3366 ddbox, bDynamicBox, bUniform, step);
3367 cell_f_row = comm->root[d]->cell_f;
3369 else
3371 cell_f_row = comm->cell_f_row;
3373 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3378 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3379 const gmx_ddbox_t *ddbox)
3381 int d;
3383 /* This function assumes the box is static and should therefore
3384 * not be called when the box has changed since the last
3385 * call to dd_partition_system.
3387 for (d = 0; d < dd->ndim; d++)
3389 relative_to_absolute_cell_bounds(dd, ddbox, d);
3395 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3396 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3397 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3398 gmx_wallcycle_t wcycle)
3400 gmx_domdec_comm_t *comm;
3401 int dim;
3403 comm = dd->comm;
3405 if (bDoDLB)
3407 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3408 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3409 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3411 else if (bDynamicBox)
3413 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3416 /* Set the dimensions for which no DD is used */
3417 for (dim = 0; dim < DIM; dim++)
3419 if (dd->nc[dim] == 1)
3421 comm->cell_x0[dim] = 0;
3422 comm->cell_x1[dim] = ddbox->box_size[dim];
3423 if (dim >= ddbox->nboundeddim)
3425 comm->cell_x0[dim] += ddbox->box0[dim];
3426 comm->cell_x1[dim] += ddbox->box0[dim];
3432 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3434 int d, np, i;
3435 gmx_domdec_comm_dim_t *cd;
3437 for (d = 0; d < dd->ndim; d++)
3439 cd = &dd->comm->cd[d];
3440 np = npulse[dd->dim[d]];
3441 if (np > cd->np_nalloc)
3443 if (debug)
3445 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3446 dim2char(dd->dim[d]), np);
3448 if (DDMASTER(dd) && cd->np_nalloc > 0)
3450 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3452 srenew(cd->ind, np);
3453 for (i = cd->np_nalloc; i < np; i++)
3455 cd->ind[i].index = nullptr;
3456 cd->ind[i].nalloc = 0;
3458 cd->np_nalloc = np;
3460 cd->np = np;
3465 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3466 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3467 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3468 gmx_wallcycle_t wcycle)
3470 gmx_domdec_comm_t *comm;
3471 int d;
3472 ivec npulse;
3474 comm = dd->comm;
3476 /* Copy the old cell boundaries for the cg displacement check */
3477 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3478 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3480 if (isDlbOn(comm))
3482 if (DDMASTER(dd))
3484 check_box_size(dd, ddbox);
3486 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3488 else
3490 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3491 realloc_comm_ind(dd, npulse);
3494 if (debug)
3496 for (d = 0; d < DIM; d++)
3498 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3499 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3504 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3505 gmx_ddbox_t *ddbox,
3506 rvec cell_ns_x0, rvec cell_ns_x1,
3507 gmx_int64_t step)
3509 gmx_domdec_comm_t *comm;
3510 int dim_ind, dim;
3512 comm = dd->comm;
3514 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3516 dim = dd->dim[dim_ind];
3518 /* Without PBC we don't have restrictions on the outer cells */
3519 if (!(dim >= ddbox->npbcdim &&
3520 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3521 isDlbOn(comm) &&
3522 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3523 comm->cellsize_min[dim])
3525 char buf[22];
3526 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3527 gmx_step_str(step, buf), dim2char(dim),
3528 comm->cell_x1[dim] - comm->cell_x0[dim],
3529 ddbox->skew_fac[dim],
3530 dd->comm->cellsize_min[dim],
3531 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3535 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3537 /* Communicate the boundaries and update cell_ns_x0/1 */
3538 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3539 if (isDlbOn(dd->comm) && dd->ndim > 1)
3541 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3546 static void make_tric_corr_matrix(int npbcdim, const matrix box, matrix tcm)
3548 if (YY < npbcdim)
3550 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3552 else
3554 tcm[YY][XX] = 0;
3556 if (ZZ < npbcdim)
3558 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3559 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3561 else
3563 tcm[ZZ][XX] = 0;
3564 tcm[ZZ][YY] = 0;
3568 static void check_screw_box(const matrix box)
3570 /* Mathematical limitation */
3571 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3573 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3576 /* Limitation due to the asymmetry of the eighth shell method */
3577 if (box[ZZ][YY] != 0)
3579 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3583 static void distribute_cg(FILE *fplog,
3584 const matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3585 gmx_domdec_t *dd)
3587 gmx_domdec_master_t *ma;
3588 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3589 int i, icg, j, k, k0, k1, d;
3590 matrix tcm;
3591 rvec cg_cm;
3592 ivec ind;
3593 real nrcg, inv_ncg, pos_d;
3594 int *cgindex;
3595 gmx_bool bScrew;
3597 ma = dd->ma;
3599 snew(tmp_nalloc, dd->nnodes);
3600 snew(tmp_ind, dd->nnodes);
3601 for (i = 0; i < dd->nnodes; i++)
3603 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3604 snew(tmp_ind[i], tmp_nalloc[i]);
3607 /* Clear the count */
3608 for (i = 0; i < dd->nnodes; i++)
3610 ma->ncg[i] = 0;
3611 ma->nat[i] = 0;
3614 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3616 cgindex = cgs->index;
3618 /* Compute the center of geometry for all charge groups */
3619 for (icg = 0; icg < cgs->nr; icg++)
3621 k0 = cgindex[icg];
3622 k1 = cgindex[icg+1];
3623 nrcg = k1 - k0;
3624 if (nrcg == 1)
3626 copy_rvec(pos[k0], cg_cm);
3628 else
3630 inv_ncg = 1.0/nrcg;
3632 clear_rvec(cg_cm);
3633 for (k = k0; (k < k1); k++)
3635 rvec_inc(cg_cm, pos[k]);
3637 for (d = 0; (d < DIM); d++)
3639 cg_cm[d] *= inv_ncg;
3642 /* Put the charge group in the box and determine the cell index */
3643 for (d = DIM-1; d >= 0; d--)
3645 pos_d = cg_cm[d];
3646 if (d < dd->npbcdim)
3648 bScrew = (dd->bScrewPBC && d == XX);
3649 if (tric_dir[d] && dd->nc[d] > 1)
3651 /* Use triclinic coordinates for this dimension */
3652 for (j = d+1; j < DIM; j++)
3654 pos_d += cg_cm[j]*tcm[j][d];
3657 while (pos_d >= box[d][d])
3659 pos_d -= box[d][d];
3660 rvec_dec(cg_cm, box[d]);
3661 if (bScrew)
3663 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3664 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3666 for (k = k0; (k < k1); k++)
3668 rvec_dec(pos[k], box[d]);
3669 if (bScrew)
3671 pos[k][YY] = box[YY][YY] - pos[k][YY];
3672 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3676 while (pos_d < 0)
3678 pos_d += box[d][d];
3679 rvec_inc(cg_cm, box[d]);
3680 if (bScrew)
3682 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3683 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3685 for (k = k0; (k < k1); k++)
3687 rvec_inc(pos[k], box[d]);
3688 if (bScrew)
3690 pos[k][YY] = box[YY][YY] - pos[k][YY];
3691 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3696 /* This could be done more efficiently */
3697 ind[d] = 0;
3698 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3700 ind[d]++;
3703 i = dd_index(dd->nc, ind);
3704 if (ma->ncg[i] == tmp_nalloc[i])
3706 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3707 srenew(tmp_ind[i], tmp_nalloc[i]);
3709 tmp_ind[i][ma->ncg[i]] = icg;
3710 ma->ncg[i]++;
3711 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3714 k1 = 0;
3715 for (i = 0; i < dd->nnodes; i++)
3717 ma->index[i] = k1;
3718 for (k = 0; k < ma->ncg[i]; k++)
3720 ma->cg[k1++] = tmp_ind[i][k];
3723 ma->index[dd->nnodes] = k1;
3725 for (i = 0; i < dd->nnodes; i++)
3727 sfree(tmp_ind[i]);
3729 sfree(tmp_ind);
3730 sfree(tmp_nalloc);
3732 if (fplog)
3734 // Use double for the sums to avoid natoms^2 overflowing
3735 // (65537^2 > 2^32)
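// The reported stddev below is sqrt(<nat^2> - <nat>^2) over the domains,
// with the averages taken over dd->nnodes.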
3736 int nat_sum, nat_min, nat_max;
3737 double nat2_sum;
3739 nat_sum = 0;
3740 nat2_sum = 0;
3741 nat_min = ma->nat[0];
3742 nat_max = ma->nat[0];
3743 for (i = 0; i < dd->nnodes; i++)
3745 nat_sum += ma->nat[i];
3746 // cast to double to avoid integer overflows when squaring
3747 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3748 nat_min = std::min(nat_min, ma->nat[i]);
3749 nat_max = std::max(nat_max, ma->nat[i]);
3751 nat_sum /= dd->nnodes;
3752 nat2_sum /= dd->nnodes;
3754 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3755 dd->nnodes,
3756 nat_sum,
3757 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3758 nat_min, nat_max);
3762 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3763 t_block *cgs, const matrix box, gmx_ddbox_t *ddbox,
3764 rvec pos[])
3766 gmx_domdec_master_t *ma = nullptr;
3767 ivec npulse;
3768 int i, cg_gl;
3769 int *ibuf, buf2[2] = { 0, 0 };
3770 gmx_bool bMaster = DDMASTER(dd);
3772 if (bMaster)
3774 ma = dd->ma;
3776 if (dd->bScrewPBC)
3778 check_screw_box(box);
3781 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3783 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3784 for (i = 0; i < dd->nnodes; i++)
3786 ma->ibuf[2*i] = ma->ncg[i];
3787 ma->ibuf[2*i+1] = ma->nat[i];
3789 ibuf = ma->ibuf;
3791 else
3793 ibuf = nullptr;
3795 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3797 dd->ncg_home = buf2[0];
3798 dd->nat_home = buf2[1];
3799 dd->ncg_tot = dd->ncg_home;
3800 dd->nat_tot = dd->nat_home;
3801 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3803 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3804 srenew(dd->index_gl, dd->cg_nalloc);
3805 srenew(dd->cgindex, dd->cg_nalloc+1);
3807 if (bMaster)
3809 for (i = 0; i < dd->nnodes; i++)
3811 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3812 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3816 dd_scatterv(dd,
3817 bMaster ? ma->ibuf : nullptr,
3818 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3819 bMaster ? ma->cg : nullptr,
3820 dd->ncg_home*sizeof(int), dd->index_gl);
3822 /* Determine the home charge group sizes */
3823 dd->cgindex[0] = 0;
3824 for (i = 0; i < dd->ncg_home; i++)
3826 cg_gl = dd->index_gl[i];
3827 dd->cgindex[i+1] =
3828 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3831 if (debug)
3833 fprintf(debug, "Home charge groups:\n");
3834 for (i = 0; i < dd->ncg_home; i++)
3836 fprintf(debug, " %d", dd->index_gl[i]);
3837 if (i % 10 == 9)
3839 fprintf(debug, "\n");
3842 fprintf(debug, "\n");
3846 static int compact_and_copy_vec_at(int ncg, int *move,
3847 int *cgindex,
3848 int nvec, int vec,
3849 rvec *src, gmx_domdec_comm_t *comm,
3850 gmx_bool bCompact)
3852 int m, icg, i, i0, i1, nrcg;
3853 int home_pos;
3854 int pos_vec[DIM*2];
3856 home_pos = 0;
3858 for (m = 0; m < DIM*2; m++)
3860 pos_vec[m] = 0;
3863 i0 = 0;
3864 for (icg = 0; icg < ncg; icg++)
3866 i1 = cgindex[icg+1];
3867 m = move[icg];
3868 if (m == -1)
3870 if (bCompact)
3872 /* Compact the home array in place */
3873 for (i = i0; i < i1; i++)
3875 copy_rvec(src[i], src[home_pos++]);
3879 else
3881 /* Copy to the communication buffer */
3882 nrcg = i1 - i0;
3883 pos_vec[m] += 1 + vec*nrcg;
3884 for (i = i0; i < i1; i++)
3886 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3888 pos_vec[m] += (nvec - vec - 1)*nrcg;
3890 if (!bCompact)
3892 home_pos += i1 - i0;
3894 i0 = i1;
3897 return home_pos;
3900 static int compact_and_copy_vec_cg(int ncg, int *move,
3901 int *cgindex,
3902 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3903 gmx_bool bCompact)
3905 int m, icg, i0, i1, nrcg;
3906 int home_pos;
3907 int pos_vec[DIM*2];
3909 home_pos = 0;
3911 for (m = 0; m < DIM*2; m++)
3913 pos_vec[m] = 0;
3916 i0 = 0;
3917 for (icg = 0; icg < ncg; icg++)
3919 i1 = cgindex[icg+1];
3920 m = move[icg];
3921 if (m == -1)
3923 if (bCompact)
3925 /* Compact the home array in place */
3926 copy_rvec(src[icg], src[home_pos++]);
3929 else
3931 nrcg = i1 - i0;
3932 /* Copy to the communication buffer */
3933 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3934 pos_vec[m] += 1 + nrcg*nvec;
3936 i0 = i1;
3938 if (!bCompact)
3940 home_pos = ncg;
3943 return home_pos;
3946 static int compact_ind(int ncg, int *move,
3947 int *index_gl, int *cgindex,
3948 int *gatindex,
3949 gmx_ga2la_t *ga2la, char *bLocalCG,
3950 int *cginfo)
3952 int cg, nat, a0, a1, a, a_gl;
3953 int home_pos;
3955 home_pos = 0;
3956 nat = 0;
3957 for (cg = 0; cg < ncg; cg++)
3959 a0 = cgindex[cg];
3960 a1 = cgindex[cg+1];
3961 if (move[cg] == -1)
3963 /* Compact the home arrays in place.
3964 * Anything that can be done here avoids access to global arrays.
3966 cgindex[home_pos] = nat;
3967 for (a = a0; a < a1; a++)
3969 a_gl = gatindex[a];
3970 gatindex[nat] = a_gl;
3971 /* The cell number stays 0, so we don't need to set it */
3972 ga2la_change_la(ga2la, a_gl, nat);
3973 nat++;
3975 index_gl[home_pos] = index_gl[cg];
3976 cginfo[home_pos] = cginfo[cg];
3977 /* The charge group remains local, so bLocalCG does not change */
3978 home_pos++;
3980 else
3982 /* Clear the global indices */
3983 for (a = a0; a < a1; a++)
3985 ga2la_del(ga2la, gatindex[a]);
3987 if (bLocalCG)
3989 bLocalCG[index_gl[cg]] = FALSE;
3993 cgindex[home_pos] = nat;
3995 return home_pos;
3998 static void clear_and_mark_ind(int ncg, int *move,
3999 int *index_gl, int *cgindex, int *gatindex,
4000 gmx_ga2la_t *ga2la, char *bLocalCG,
4001 int *cell_index)
4003 int cg, a0, a1, a;
4005 for (cg = 0; cg < ncg; cg++)
4007 if (move[cg] >= 0)
4009 a0 = cgindex[cg];
4010 a1 = cgindex[cg+1];
4011 /* Clear the global indices */
4012 for (a = a0; a < a1; a++)
4014 ga2la_del(ga2la, gatindex[a]);
4016 if (bLocalCG)
4018 bLocalCG[index_gl[cg]] = FALSE;
4020 /* Signal that this cg has moved using the ns cell index.
4021 * Here we set it to -1. fill_grid will change it
4022 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4024 cell_index[cg] = -1;
4029 static void print_cg_move(FILE *fplog,
4030 gmx_domdec_t *dd,
4031 gmx_int64_t step, int cg, int dim, int dir,
4032 gmx_bool bHaveCgcmOld, real limitd,
4033 rvec cm_old, rvec cm_new, real pos_d)
4035 gmx_domdec_comm_t *comm;
4036 char buf[22];
4038 comm = dd->comm;
4040 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4041 if (limitd > 0)
4043 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4044 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4045 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4047 else
4049 /* We don't have a limiting distance available: don't print it */
4050 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4051 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4052 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4054 fprintf(fplog, "distance out of cell %f\n",
4055 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4056 if (bHaveCgcmOld)
4058 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4059 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4061 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4062 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4063 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4064 dim2char(dim),
4065 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4066 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4067 dim2char(dim),
4068 comm->cell_x0[dim], comm->cell_x1[dim]);
4071 static void cg_move_error(FILE *fplog,
4072 gmx_domdec_t *dd,
4073 gmx_int64_t step, int cg, int dim, int dir,
4074 gmx_bool bHaveCgcmOld, real limitd,
4075 rvec cm_old, rvec cm_new, real pos_d)
4077 if (fplog)
4079 print_cg_move(fplog, dd, step, cg, dim, dir,
4080 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4082 print_cg_move(stderr, dd, step, cg, dim, dir,
4083 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4084 gmx_fatal(FARGS,
4085 "%s moved too far between two domain decomposition steps\n"
4086 "This usually means that your system is not well equilibrated",
4087 dd->comm->bCGs ? "A charge group" : "An atom");
4090 static void rotate_state_atom(t_state *state, int a)
4092 if (state->flags & (1 << estX))
4094 /* Rotate the complete state; for a rectangular box only */
4095 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4096 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4098 if (state->flags & (1 << estV))
4100 state->v[a][YY] = -state->v[a][YY];
4101 state->v[a][ZZ] = -state->v[a][ZZ];
4103 if (state->flags & (1 << estCGP))
4105 state->cg_p[a][YY] = -state->cg_p[a][YY];
4106 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4110 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4112 if (natoms > comm->moved_nalloc)
4114 /* Contents should be preserved here */
4115 comm->moved_nalloc = over_alloc_dd(natoms);
4116 srenew(comm->moved, comm->moved_nalloc);
4119 return comm->moved;
4122 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4123 gmx_domdec_t *dd,
4124 t_state *state,
4125 ivec tric_dir, matrix tcm,
4126 rvec cell_x0, rvec cell_x1,
4127 rvec limitd, rvec limit0, rvec limit1,
4128 const int *cgindex,
4129 int cg_start, int cg_end,
4130 rvec *cg_cm,
4131 int *move)
4133 int npbcdim;
4134 int cg, k, k0, k1, d, dim, d2;
4135 int mc, nrcg;
4136 int flag;
4137 gmx_bool bScrew;
4138 ivec dev;
4139 real inv_ncg, pos_d;
4140 rvec cm_new;
4142 npbcdim = dd->npbcdim;
4144 for (cg = cg_start; cg < cg_end; cg++)
4146 k0 = cgindex[cg];
4147 k1 = cgindex[cg+1];
4148 nrcg = k1 - k0;
4149 if (nrcg == 1)
4151 copy_rvec(state->x[k0], cm_new);
4153 else
4155 inv_ncg = 1.0/nrcg;
4157 clear_rvec(cm_new);
4158 for (k = k0; (k < k1); k++)
4160 rvec_inc(cm_new, state->x[k]);
4162 for (d = 0; (d < DIM); d++)
4164 cm_new[d] = inv_ncg*cm_new[d];
4168 clear_ivec(dev);
4169 /* Do pbc and check DD cell boundary crossings */
4170 for (d = DIM-1; d >= 0; d--)
4172 if (dd->nc[d] > 1)
4174 bScrew = (dd->bScrewPBC && d == XX);
4175 /* Determine the location of this cg in lattice coordinates */
4176 pos_d = cm_new[d];
4177 if (tric_dir[d])
4179 for (d2 = d+1; d2 < DIM; d2++)
4181 pos_d += cm_new[d2]*tcm[d2][d];
4184 /* Put the charge group in the triclinic unit-cell */
4185 if (pos_d >= cell_x1[d])
4187 if (pos_d >= limit1[d])
4189 cg_move_error(fplog, dd, step, cg, d, 1,
4190 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4191 cg_cm[cg], cm_new, pos_d);
4193 dev[d] = 1;
4194 if (dd->ci[d] == dd->nc[d] - 1)
4196 rvec_dec(cm_new, state->box[d]);
4197 if (bScrew)
4199 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4200 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4202 for (k = k0; (k < k1); k++)
4204 rvec_dec(state->x[k], state->box[d]);
4205 if (bScrew)
4207 rotate_state_atom(state, k);
4212 else if (pos_d < cell_x0[d])
4214 if (pos_d < limit0[d])
4216 cg_move_error(fplog, dd, step, cg, d, -1,
4217 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4218 cg_cm[cg], cm_new, pos_d);
4220 dev[d] = -1;
4221 if (dd->ci[d] == 0)
4223 rvec_inc(cm_new, state->box[d]);
4224 if (bScrew)
4226 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4227 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4229 for (k = k0; (k < k1); k++)
4231 rvec_inc(state->x[k], state->box[d]);
4232 if (bScrew)
4234 rotate_state_atom(state, k);
4240 else if (d < npbcdim)
4242 /* Put the charge group in the rectangular unit-cell */
4243 while (cm_new[d] >= state->box[d][d])
4245 rvec_dec(cm_new, state->box[d]);
4246 for (k = k0; (k < k1); k++)
4248 rvec_dec(state->x[k], state->box[d]);
4251 while (cm_new[d] < 0)
4253 rvec_inc(cm_new, state->box[d]);
4254 for (k = k0; (k < k1); k++)
4256 rvec_inc(state->x[k], state->box[d]);
4262 copy_rvec(cm_new, cg_cm[cg]);
4264 /* Determine where this cg should go */
4265 flag = 0;
4266 mc = -1;
4267 for (d = 0; d < dd->ndim; d++)
4269 dim = dd->dim[d];
4270 if (dev[dim] == 1)
4272 flag |= DD_FLAG_FW(d);
4273 if (mc == -1)
4275 mc = d*2;
4278 else if (dev[dim] == -1)
4280 flag |= DD_FLAG_BW(d);
4281 if (mc == -1)
4283 if (dd->nc[dim] > 2)
4285 mc = d*2 + 1;
4287 else
4289 mc = d*2;
4294 /* Temporarily store the flag in move */
4295 move[cg] = mc + flag;
4299 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4300 gmx_domdec_t *dd, ivec tric_dir,
4301 t_state *state, PaddedRVecVector *f,
4302 t_forcerec *fr,
4303 gmx_bool bCompact,
4304 t_nrnb *nrnb,
4305 int *ncg_stay_home,
4306 int *ncg_moved)
4308 int *move;
4309 int npbcdim;
4310 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4311 int i, cg, k, d, dim, dim2, dir, d2, d3;
4312 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4313 int sbuf[2], rbuf[2];
4314 int home_pos_cg, home_pos_at, buf_pos;
4315 int flag;
4316 real pos_d;
4317 matrix tcm;
4318 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4319 int *cgindex;
4320 cginfo_mb_t *cginfo_mb;
4321 gmx_domdec_comm_t *comm;
4322 int *moved;
4323 int nthread, thread;
4325 if (dd->bScrewPBC)
4327 check_screw_box(state->box);
4330 comm = dd->comm;
4331 if (fr->cutoff_scheme == ecutsGROUP)
4333 cg_cm = fr->cg_cm;
4336 // Positions are always present, so there's nothing to flag
4337 bool bV = state->flags & (1<<estV);
4338 bool bCGP = state->flags & (1<<estCGP);
4340 if (dd->ncg_tot > comm->nalloc_int)
4342 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4343 srenew(comm->buf_int, comm->nalloc_int);
4345 move = comm->buf_int;
4347 npbcdim = dd->npbcdim;
4349 for (d = 0; (d < DIM); d++)
4351 limitd[d] = dd->comm->cellsize_min[d];
4352 if (d >= npbcdim && dd->ci[d] == 0)
4354 cell_x0[d] = -GMX_FLOAT_MAX;
4356 else
4358 cell_x0[d] = comm->cell_x0[d];
4360 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4362 cell_x1[d] = GMX_FLOAT_MAX;
4364 else
4366 cell_x1[d] = comm->cell_x1[d];
4368 if (d < npbcdim)
4370 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4371 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4373 else
4375 /* We check after communication if a charge group moved
4376 * more than one cell. Set the pre-comm check limit to float_max.
4378 limit0[d] = -GMX_FLOAT_MAX;
4379 limit1[d] = GMX_FLOAT_MAX;
4383 make_tric_corr_matrix(npbcdim, state->box, tcm);
4385 cgindex = dd->cgindex;
4387 nthread = gmx_omp_nthreads_get(emntDomdec);
4389 /* Compute the center of geometry for all home charge groups
4390 * and put them in the box and determine where they should go.
4392 #pragma omp parallel for num_threads(nthread) schedule(static)
4393 for (thread = 0; thread < nthread; thread++)
4397 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4398 cell_x0, cell_x1, limitd, limit0, limit1,
4399 cgindex,
4400 ( thread *dd->ncg_home)/nthread,
4401 ((thread+1)*dd->ncg_home)/nthread,
4402 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4403 move);
4405 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4408 for (cg = 0; cg < dd->ncg_home; cg++)
4410 if (move[cg] >= 0)
4412 mc = move[cg];
4413 flag = mc & ~DD_FLAG_NRCG;
4414 mc = mc & DD_FLAG_NRCG;
4415 move[cg] = mc;
4417 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4419 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4420 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4422 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4423 /* We store the cg size in the lower 16 bits
4424 * and the place where the charge group should go
4425 * in the next 6 bits. This saves some communication volume.
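 * Illustrative layout (exact masks are defined elsewhere as DD_FLAG_NRCG
 * and DD_FLAG_FW/DD_FLAG_BW): bits 0-15 hold nrcg, the next 6 bits hold
 * the forward/backward destination flags for the up to three DD dimensions.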
4427 nrcg = cgindex[cg+1] - cgindex[cg];
4428 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4429 ncg[mc] += 1;
4430 nat[mc] += nrcg;
4434 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4435 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4437 *ncg_moved = 0;
4438 for (i = 0; i < dd->ndim*2; i++)
4440 *ncg_moved += ncg[i];
4443 nvec = 1;
4444 if (bV)
4446 nvec++;
4448 if (bCGP)
4450 nvec++;
4453 /* Make sure the communication buffers are large enough */
4454 for (mc = 0; mc < dd->ndim*2; mc++)
4456 nvr = ncg[mc] + nat[mc]*nvec;
4457 if (nvr > comm->cgcm_state_nalloc[mc])
4459 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4460 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4464 switch (fr->cutoff_scheme)
4466 case ecutsGROUP:
4467 /* Recalculating cg_cm might be cheaper than communicating,
4468 * but that could give rise to rounding issues.
4470 home_pos_cg =
4471 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4472 nvec, cg_cm, comm, bCompact);
4473 break;
4474 case ecutsVERLET:
4475 /* Without charge groups we send the moved atom coordinates
4476 * over twice. This is so the code below can be used without
4477 * many conditionals, both with and without charge groups.
4479 home_pos_cg =
4480 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4481 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4482 if (bCompact)
4484 home_pos_cg -= *ncg_moved;
4486 break;
4487 default:
4488 gmx_incons("unimplemented");
4489 home_pos_cg = 0;
4492 vec = 0;
4493 home_pos_at =
4494 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4495 nvec, vec++, as_rvec_array(state->x.data()),
4496 comm, bCompact);
4497 if (bV)
4499 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4500 nvec, vec++, as_rvec_array(state->v.data()),
4501 comm, bCompact);
4503 if (bCGP)
4505 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4506 nvec, vec++, as_rvec_array(state->cg_p.data()),
4507 comm, bCompact);
4510 if (bCompact)
4512 compact_ind(dd->ncg_home, move,
4513 dd->index_gl, dd->cgindex, dd->gatindex,
4514 dd->ga2la, comm->bLocalCG,
4515 fr->cginfo);
4517 else
4519 if (fr->cutoff_scheme == ecutsVERLET)
4521 moved = get_moved(comm, dd->ncg_home);
4523 for (k = 0; k < dd->ncg_home; k++)
4525 moved[k] = 0;
4528 else
4530 moved = fr->ns->grid->cell_index;
4533 clear_and_mark_ind(dd->ncg_home, move,
4534 dd->index_gl, dd->cgindex, dd->gatindex,
4535 dd->ga2la, comm->bLocalCG,
4536 moved);
4539 cginfo_mb = fr->cginfo_mb;
4541 *ncg_stay_home = home_pos_cg;
4542 for (d = 0; d < dd->ndim; d++)
4544 dim = dd->dim[d];
4545 ncg_recv = 0;
4546 nvr = 0;
4547 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4549 cdd = d*2 + dir;
4550 /* Communicate the cg and atom counts */
4551 sbuf[0] = ncg[cdd];
4552 sbuf[1] = nat[cdd];
4553 if (debug)
4555 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4556 d, dir, sbuf[0], sbuf[1]);
4558 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4560 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4562 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4563 srenew(comm->buf_int, comm->nalloc_int);
4566 /* Communicate the charge group indices, sizes and flags */
4567 dd_sendrecv_int(dd, d, dir,
4568 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4569 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4571 nvs = ncg[cdd] + nat[cdd]*nvec;
4572 i = rbuf[0] + rbuf[1] *nvec;
4573 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4575 /* Communicate cgcm and state */
4576 dd_sendrecv_rvec(dd, d, dir,
4577 comm->cgcm_state[cdd], nvs,
4578 comm->vbuf.v+nvr, i);
4579 ncg_recv += rbuf[0];
4580 nvr += i;
4583 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4584 if (fr->cutoff_scheme == ecutsGROUP)
4586 /* Here we resize to more than necessary and shrink later */
4587 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4590 /* Process the received charge groups */
4591 buf_pos = 0;
4592 for (cg = 0; cg < ncg_recv; cg++)
4594 flag = comm->buf_int[cg*DD_CGIBS+1];
4596 if (dim >= npbcdim && dd->nc[dim] > 2)
4598 /* No pbc in this dim and more than one domain boundary.
4599 * We do a separate check whether a charge group moved too far.
4601 if (((flag & DD_FLAG_FW(d)) &&
4602 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4603 ((flag & DD_FLAG_BW(d)) &&
4604 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4606 cg_move_error(fplog, dd, step, cg, dim,
4607 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4608 fr->cutoff_scheme == ecutsGROUP, 0,
4609 comm->vbuf.v[buf_pos],
4610 comm->vbuf.v[buf_pos],
4611 comm->vbuf.v[buf_pos][dim]);
4615 mc = -1;
4616 if (d < dd->ndim-1)
4618 /* Check which direction this cg should go */
4619 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4621 if (isDlbOn(dd->comm))
4623 /* The cell boundaries for dimension d2 are not equal
4624 * for each cell row of the lower dimension(s),
4625 * therefore we might need to redetermine where
4626 * this cg should go.
4628 dim2 = dd->dim[d2];
4629 /* If this cg crosses the box boundary in dimension d2
4630 * we can use the communicated flag, so we do not
4631 * have to worry about pbc.
4633 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4634 (flag & DD_FLAG_FW(d2))) ||
4635 (dd->ci[dim2] == 0 &&
4636 (flag & DD_FLAG_BW(d2)))))
4638 /* Clear the two flags for this dimension */
4639 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4640 /* Determine the location of this cg
4641 * in lattice coordinates
4643 pos_d = comm->vbuf.v[buf_pos][dim2];
4644 if (tric_dir[dim2])
4646 for (d3 = dim2+1; d3 < DIM; d3++)
4648 pos_d +=
4649 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4652 /* Check if we are not at the box edge.
4653 * pbc is only handled in the first step above,
4654 * but this check could move over pbc while
4655 * the first step did not due to different rounding.
4657 if (pos_d >= cell_x1[dim2] &&
4658 dd->ci[dim2] != dd->nc[dim2]-1)
4660 flag |= DD_FLAG_FW(d2);
4662 else if (pos_d < cell_x0[dim2] &&
4663 dd->ci[dim2] != 0)
4665 flag |= DD_FLAG_BW(d2);
4667 comm->buf_int[cg*DD_CGIBS+1] = flag;
4670 /* Set to which neighboring cell this cg should go */
4671 if (flag & DD_FLAG_FW(d2))
4673 mc = d2*2;
4675 else if (flag & DD_FLAG_BW(d2))
4677 if (dd->nc[dd->dim[d2]] > 2)
4679 mc = d2*2+1;
4681 else
4683 mc = d2*2;
4689 nrcg = flag & DD_FLAG_NRCG;
4690 if (mc == -1)
4692 if (home_pos_cg+1 > dd->cg_nalloc)
4694 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4695 srenew(dd->index_gl, dd->cg_nalloc);
4696 srenew(dd->cgindex, dd->cg_nalloc+1);
4698 /* Set the global charge group index and size */
4699 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4700 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4701 /* Copy the state from the buffer */
4702 if (fr->cutoff_scheme == ecutsGROUP)
4704 cg_cm = fr->cg_cm;
4705 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4707 buf_pos++;
4709 /* Set the cginfo */
4710 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4711 dd->index_gl[home_pos_cg]);
4712 if (comm->bLocalCG)
4714 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4717 for (i = 0; i < nrcg; i++)
4719 copy_rvec(comm->vbuf.v[buf_pos++],
4720 state->x[home_pos_at+i]);
4722 if (bV)
4724 for (i = 0; i < nrcg; i++)
4726 copy_rvec(comm->vbuf.v[buf_pos++],
4727 state->v[home_pos_at+i]);
4730 if (bCGP)
4732 for (i = 0; i < nrcg; i++)
4734 copy_rvec(comm->vbuf.v[buf_pos++],
4735 state->cg_p[home_pos_at+i]);
4738 home_pos_cg += 1;
4739 home_pos_at += nrcg;
4741 else
4743 /* Reallocate the buffers if necessary */
4744 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4746 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4747 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4749 nvr = ncg[mc] + nat[mc]*nvec;
4750 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4752 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4753 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4755 /* Copy from the receive to the send buffers */
4756 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4757 comm->buf_int + cg*DD_CGIBS,
4758 DD_CGIBS*sizeof(int));
4759 memcpy(comm->cgcm_state[mc][nvr],
4760 comm->vbuf.v[buf_pos],
4761 (1+nrcg*nvec)*sizeof(rvec));
4762 buf_pos += 1 + nrcg*nvec;
4763 ncg[mc] += 1;
4764 nat[mc] += nrcg;
4769 /* With sorting (!bCompact) the indices are now only partially up to date
4770 * and ncg_home and nat_home are not the real count, since there are
4771 * "holes" in the arrays for the charge groups that moved to neighbors.
4773 if (fr->cutoff_scheme == ecutsVERLET)
4775 moved = get_moved(comm, home_pos_cg);
4777 for (i = dd->ncg_home; i < home_pos_cg; i++)
4779 moved[i] = 0;
4782 dd->ncg_home = home_pos_cg;
4783 dd->nat_home = home_pos_at;
4785 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4787 /* We over-allocated before, so we need to set the right size here */
4788 dd_resize_state(state, f, dd->nat_home);
4791 if (debug)
4793 fprintf(debug,
4794 "Finished repartitioning: cgs moved out %d, new home %d\n",
4795 *ncg_moved, dd->ncg_home-*ncg_moved);
4800 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
4802 /* Note that the cycles value can be incorrect, either 0 or some
4803 * extremely large value, when our thread migrated to another core
4804 * with an unsynchronized cycle counter. If this happens less often
4805 * than once per nstlist steps, this will not cause issues, since
4806 * we later subtract the maximum value from the sum over nstlist steps.
4807 * A zero count will slightly lower the total, but that's a small effect.
4808 * Note that the main purpose of the subtraction of the maximum value
4809 * is to avoid throwing off the load balancing when stalls occur due
4810 * to e.g. system activity or network congestion.
4812 dd->comm->cycl[ddCycl] += cycles;
4813 dd->comm->cycl_n[ddCycl]++;
4814 if (cycles > dd->comm->cycl_max[ddCycl])
4816 dd->comm->cycl_max[ddCycl] = cycles;
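/*! \brief Return a cost-weighted estimate of the force-calculation flop count from the nrnb counters */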
4820 static double force_flop_count(t_nrnb *nrnb)
4822 int i;
4823 double sum;
4824 const char *name;
4826 sum = 0;
4827 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4829 /* To get closer to the real timings, we halve the count
4830 * for the normal loops and halve it again for water loops.
4832 name = nrnb_str(i);
4833 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4835 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4837 else
4839 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4842 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4844 name = nrnb_str(i);
4845 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4847 sum += nrnb->n[i]*cost_nrnb(i);
4850 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4852 sum += nrnb->n[i]*cost_nrnb(i);
4855 return sum;
4858 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4860 if (dd->comm->eFlop)
4862 dd->comm->flop -= force_flop_count(nrnb);
4865 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4867 if (dd->comm->eFlop)
4869 dd->comm->flop += force_flop_count(nrnb);
4870 dd->comm->flop_n++;
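/*! \brief Reset the DD cycle and flop counters */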
4874 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4876 int i;
4878 for (i = 0; i < ddCyclNr; i++)
4880 dd->comm->cycl[i] = 0;
4881 dd->comm->cycl_n[i] = 0;
4882 dd->comm->cycl_max[i] = 0;
4884 dd->comm->flop = 0;
4885 dd->comm->flop_n = 0;
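/*! \brief Gather the measured load along each DD dimension onto the row roots and accumulate the totals on the master rank */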
4888 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4890 gmx_domdec_comm_t *comm;
4891 domdec_load_t *load;
4892 domdec_root_t *root = nullptr;
4893 int d, dim, i, pos;
4894 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4895 gmx_bool bSepPME;
4897 if (debug)
4899 fprintf(debug, "get_load_distribution start\n");
4902 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4904 comm = dd->comm;
4906 bSepPME = (dd->pme_nodeid >= 0);
4908 if (dd->ndim == 0 && bSepPME)
4910 /* Without decomposition, but with PME nodes, we need the load */
4911 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4912 comm->load[0].pme = comm->cycl[ddCyclPME];
4915 for (d = dd->ndim-1; d >= 0; d--)
4917 dim = dd->dim[d];
4918 /* Check if we participate in the communication in this dimension */
4919 if (d == dd->ndim-1 ||
4920 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4922 load = &comm->load[d];
4923 if (isDlbOn(dd->comm))
4925 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4927 pos = 0;
4928 if (d == dd->ndim-1)
4930 sbuf[pos++] = dd_force_load(comm);
4931 sbuf[pos++] = sbuf[0];
4932 if (isDlbOn(dd->comm))
4934 sbuf[pos++] = sbuf[0];
4935 sbuf[pos++] = cell_frac;
4936 if (d > 0)
4938 sbuf[pos++] = comm->cell_f_max0[d];
4939 sbuf[pos++] = comm->cell_f_min1[d];
4942 if (bSepPME)
4944 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4945 sbuf[pos++] = comm->cycl[ddCyclPME];
4948 else
4950 sbuf[pos++] = comm->load[d+1].sum;
4951 sbuf[pos++] = comm->load[d+1].max;
4952 if (isDlbOn(dd->comm))
4954 sbuf[pos++] = comm->load[d+1].sum_m;
4955 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4956 sbuf[pos++] = comm->load[d+1].flags;
4957 if (d > 0)
4959 sbuf[pos++] = comm->cell_f_max0[d];
4960 sbuf[pos++] = comm->cell_f_min1[d];
4963 if (bSepPME)
4965 sbuf[pos++] = comm->load[d+1].mdf;
4966 sbuf[pos++] = comm->load[d+1].pme;
4969 load->nload = pos;
4970 /* Communicate a row in DD direction d.
4971 * The communicators are set up such that the root always has rank 0.
4973 #if GMX_MPI
4974 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
4975 load->load, load->nload*sizeof(float), MPI_BYTE,
4976 0, comm->mpi_comm_load[d]);
4977 #endif
4978 if (dd->ci[dim] == dd->master_ci[dim])
4980 /* We are the root, process this row */
4981 if (isDlbOn(comm))
4983 root = comm->root[d];
4985 load->sum = 0;
4986 load->max = 0;
4987 load->sum_m = 0;
4988 load->cvol_min = 1;
4989 load->flags = 0;
4990 load->mdf = 0;
4991 load->pme = 0;
4992 pos = 0;
4993 for (i = 0; i < dd->nc[dim]; i++)
4995 load->sum += load->load[pos++];
4996 load->max = std::max(load->max, load->load[pos]);
4997 pos++;
4998 if (isDlbOn(dd->comm))
5000 if (root->bLimited)
5002 /* This direction could not be load balanced properly,
5003 * therefore we need to use the maximum instead of the average load.
5005 load->sum_m = std::max(load->sum_m, load->load[pos]);
5007 else
5009 load->sum_m += load->load[pos];
5011 pos++;
5012 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5013 pos++;
5014 if (d < dd->ndim-1)
5016 load->flags = (int)(load->load[pos++] + 0.5);
5018 if (d > 0)
5020 root->cell_f_max0[i] = load->load[pos++];
5021 root->cell_f_min1[i] = load->load[pos++];
5024 if (bSepPME)
5026 load->mdf = std::max(load->mdf, load->load[pos]);
5027 pos++;
5028 load->pme = std::max(load->pme, load->load[pos]);
5029 pos++;
5032 if (isDlbOn(comm) && root->bLimited)
5034 load->sum_m *= dd->nc[dim];
5035 load->flags |= (1<<d);
5041 if (DDMASTER(dd))
5043 comm->nload += dd_load_count(comm);
5044 comm->load_step += comm->cycl[ddCyclStep];
5045 comm->load_sum += comm->load[0].sum;
5046 comm->load_max += comm->load[0].max;
5047 if (isDlbOn(comm))
5049 for (d = 0; d < dd->ndim; d++)
5051 if (comm->load[0].flags & (1<<d))
5053 comm->load_lim[d]++;
5057 if (bSepPME)
5059 comm->load_mdf += comm->load[0].mdf;
5060 comm->load_pme += comm->load[0].pme;
5064 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5066 if (debug)
5068 fprintf(debug, "get_load_distribution finished\n");
5072 static float dd_force_load_fraction(gmx_domdec_t *dd)
5074 /* Return the fraction of the run time spent in the force calculation,
5075 * which is the part of the step that can be load balanced.
5077 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5079 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
5081 else
5083 return 0;
5087 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5089 /* Return the relative performance loss on the total run time
5090 * due to the force calculation load imbalance.
5092 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5094 return
5095 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5096 (dd->comm->load_step*dd->nnodes);
5098 else
5100 return 0;
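/*! \brief Print the DD load balance report at the end of the run to the log file and stderr */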
5104 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5106 gmx_domdec_comm_t *comm = dd->comm;
5108 /* Only the master rank prints loads and only if we measured loads */
5109 if (!DDMASTER(dd) || comm->nload == 0)
5111 return;
5114 char buf[STRLEN];
5115 int numPpRanks = dd->nnodes;
5116 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5117 int numRanks = numPpRanks + numPmeRanks;
5118 float lossFraction = 0;
5120 /* Print the average load imbalance and performance loss */
5121 if (dd->nnodes > 1 && comm->load_sum > 0)
5123 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
5124 lossFraction = dd_force_imb_perf_loss(dd);
5126 std::string msg = "\n Dynamic load balancing report:\n";
5127 std::string dlbStateStr = "";
5129 switch (dd->comm->dlbState)
5131 case edlbsOffUser:
5132 dlbStateStr = "DLB was off during the run per user request.";
5133 break;
5134 case edlbsOffForever:
5135 /* Currently this can happen due to observed performance loss, cell size
5136 * limitations or incompatibility with other settings detected during
5137 * determineInitialDlbState(). */
5138 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
5139 break;
5140 case edlbsOffCanTurnOn:
5141 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
5142 break;
5143 case edlbsOffTemporarilyLocked:
5144 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
5145 break;
5146 case edlbsOnCanTurnOff:
5147 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
5148 break;
5149 case edlbsOnUser:
5150 dlbStateStr = "DLB was permanently on during the run per user request.";
5151 break;
5152 default:
5153 GMX_ASSERT(false, "Undocumented DLB state");
5156 msg += " " + dlbStateStr + "\n";
5157 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
5158 msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
5159 static_cast<int>(dd_force_load_fraction(dd)*100 + 0.5));
5160 msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
5161 lossFraction*100);
5162 fprintf(fplog, "%s", msg.c_str());
5163 fprintf(stderr, "%s", msg.c_str());
5166 /* Print during what percentage of steps the load balancing was limited */
5167 bool dlbWasLimited = false;
5168 if (isDlbOn(comm))
5170 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5171 for (int d = 0; d < dd->ndim; d++)
5173 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
5174 sprintf(buf+strlen(buf), " %c %d %%",
5175 dim2char(dd->dim[d]), limitPercentage);
5176 if (limitPercentage >= 50)
5178 dlbWasLimited = true;
5181 sprintf(buf + strlen(buf), "\n");
5182 fprintf(fplog, "%s", buf);
5183 fprintf(stderr, "%s", buf);
5186 /* Print the performance loss due to separate PME - PP rank imbalance */
5187 float lossFractionPme = 0;
5188 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5190 float pmeForceRatio = comm->load_pme/comm->load_mdf;
5191 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
5192 if (lossFractionPme <= 0)
5194 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
5196 else
5198 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
5200 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
5201 fprintf(fplog, "%s", buf);
5202 fprintf(stderr, "%s", buf);
5203 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
5204 fprintf(fplog, "%s", buf);
5205 fprintf(stderr, "%s", buf);
5207 fprintf(fplog, "\n");
5208 fprintf(stderr, "\n");
5210 if (lossFraction >= DD_PERF_LOSS_WARN)
5212 sprintf(buf,
5213 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5214 " in the domain decomposition.\n", lossFraction*100);
5215 if (!isDlbOn(comm))
5217 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb).\n");
5219 else if (dlbWasLimited)
5221 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5223 fprintf(fplog, "%s\n", buf);
5224 fprintf(stderr, "%s\n", buf);
5226 if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
5228 sprintf(buf,
5229 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5230 " had %s work to do than the PP ranks.\n"
5231 " You might want to %s the number of PME ranks\n"
5232 " or %s the cut-off and the grid spacing.\n",
5233 fabs(lossFractionPme*100),
5234 (lossFractionPme < 0) ? "less" : "more",
5235 (lossFractionPme < 0) ? "decrease" : "increase",
5236 (lossFractionPme < 0) ? "decrease" : "increase");
5237 fprintf(fplog, "%s\n", buf);
5238 fprintf(stderr, "%s\n", buf);
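/*! \brief Return the minimum cell volume as a fraction of the average cell volume */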
5242 static float dd_vol_min(gmx_domdec_t *dd)
5244 return dd->comm->load[0].cvol_min*dd->nnodes;
5247 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5249 return dd->comm->load[0].flags;
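/*! \brief Return the force load imbalance: the maximum load over the average load, minus one */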
5252 static float dd_f_imbal(gmx_domdec_t *dd)
5254 if (dd->comm->load[0].sum > 0)
5256 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5258 else
5260 /* Something is wrong in the cycle counting, report no load imbalance */
5261 return 0.0f;
5265 float dd_pme_f_ratio(gmx_domdec_t *dd)
5267 /* Should only be called on the DD master rank */
5268 assert(DDMASTER(dd));
5270 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5272 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5274 else
5276 return -1.0;
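/*! \brief Print a per-step DD load report to the log file */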
5280 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5282 int flags, d;
5283 char buf[22];
5285 flags = dd_load_flags(dd);
5286 if (flags)
5288 fprintf(fplog,
5289 "DD load balancing is limited by minimum cell size in dimension");
5290 for (d = 0; d < dd->ndim; d++)
5292 if (flags & (1<<d))
5294 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5297 fprintf(fplog, "\n");
5299 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5300 if (isDlbOn(dd->comm))
5302 fprintf(fplog, " vol min/aver %5.3f%c",
5303 dd_vol_min(dd), flags ? '!' : ' ');
5305 if (dd->nnodes > 1)
5307 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5309 if (dd->comm->cycl_n[ddCyclPME])
5311 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5313 fprintf(fplog, "\n\n");
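/*! \brief Print a brief DD load report to stderr */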
5316 static void dd_print_load_verbose(gmx_domdec_t *dd)
5318 if (isDlbOn(dd->comm))
5320 fprintf(stderr, "vol %4.2f%c ",
5321 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5323 if (dd->nnodes > 1)
5325 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5327 if (dd->comm->cycl_n[ddCyclPME])
5329 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5333 #if GMX_MPI
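/*! \brief Create the MPI communicator for one row of cells along dimension dim_ind, used for collecting load data */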
5334 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5336 MPI_Comm c_row;
5337 int dim, i, rank;
5338 ivec loc_c;
5339 domdec_root_t *root;
5340 gmx_bool bPartOfGroup = FALSE;
5342 dim = dd->dim[dim_ind];
5343 copy_ivec(loc, loc_c);
5344 for (i = 0; i < dd->nc[dim]; i++)
5346 loc_c[dim] = i;
5347 rank = dd_index(dd->nc, loc_c);
5348 if (rank == dd->rank)
5350 /* This process is part of the group */
5351 bPartOfGroup = TRUE;
5354 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5355 &c_row);
5356 if (bPartOfGroup)
5358 dd->comm->mpi_comm_load[dim_ind] = c_row;
5359 if (!isDlbDisabled(dd->comm))
5361 if (dd->ci[dim] == dd->master_ci[dim])
5363 /* This is the root process of this row */
5364 snew(dd->comm->root[dim_ind], 1);
5365 root = dd->comm->root[dim_ind];
5366 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5367 snew(root->old_cell_f, dd->nc[dim]+1);
5368 snew(root->bCellMin, dd->nc[dim]);
5369 if (dim_ind > 0)
5371 snew(root->cell_f_max0, dd->nc[dim]);
5372 snew(root->cell_f_min1, dd->nc[dim]);
5373 snew(root->bound_min, dd->nc[dim]);
5374 snew(root->bound_max, dd->nc[dim]);
5376 snew(root->buf_ncd, dd->nc[dim]);
5378 else
5380 /* This is not a root process, we only need to receive cell_f */
5381 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5384 if (dd->ci[dim] == dd->master_ci[dim])
5386 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5390 #endif
5392 void dd_setup_dlb_resource_sharing(t_commrec *cr,
5393 int gpu_id)
5395 #if GMX_MPI
5396 int physicalnode_id_hash;
5397 gmx_domdec_t *dd;
5398 MPI_Comm mpi_comm_pp_physicalnode;
5400 if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
5402 /* Only ranks with short-ranged tasks (currently) use GPUs.
5403 * If we don't have GPUs assigned, there are no resources to share.
5405 return;
5408 physicalnode_id_hash = gmx_physicalnode_id_hash();
5410 dd = cr->dd;
5412 if (debug)
5414 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5415 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5416 dd->rank, physicalnode_id_hash, gpu_id);
5418 /* Split the PP communicator over the physical nodes */
5419 /* TODO: See if we should store this (before), as it's also used
5420 * for the nodecomm summation.
5422 // TODO PhysicalNodeCommunicator could be extended/used to handle
5423 // the need for per-node per-group communicators.
5424 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5425 &mpi_comm_pp_physicalnode);
5426 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5427 &dd->comm->mpi_comm_gpu_shared);
5428 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5429 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5431 if (debug)
5433 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5436 /* Note that some ranks could share a GPU, while others don't */
5438 if (dd->comm->nrank_gpu_shared == 1)
5440 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5442 #else
5443 GMX_UNUSED_VALUE(cr);
5444 GMX_UNUSED_VALUE(gpu_id);
5445 #endif
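/*! \brief Create the load-collection communicators for all decomposed dimensions */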
5448 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5450 #if GMX_MPI
5451 int dim0, dim1, i, j;
5452 ivec loc;
5454 if (debug)
5456 fprintf(debug, "Making load communicators\n");
5459 snew(dd->comm->load, std::max(dd->ndim, 1));
5460 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5462 if (dd->ndim == 0)
5464 return;
5467 clear_ivec(loc);
5468 make_load_communicator(dd, 0, loc);
5469 if (dd->ndim > 1)
5471 dim0 = dd->dim[0];
5472 for (i = 0; i < dd->nc[dim0]; i++)
5474 loc[dim0] = i;
5475 make_load_communicator(dd, 1, loc);
5478 if (dd->ndim > 2)
5480 dim0 = dd->dim[0];
5481 for (i = 0; i < dd->nc[dim0]; i++)
5483 loc[dim0] = i;
5484 dim1 = dd->dim[1];
5485 for (j = 0; j < dd->nc[dim1]; j++)
5487 loc[dim1] = j;
5488 make_load_communicator(dd, 2, loc);
5493 if (debug)
5495 fprintf(debug, "Finished making load communicators\n");
5497 #endif
5500 /*! \brief Sets up the relation between neighboring domains and zones */
5501 static void setup_neighbor_relations(gmx_domdec_t *dd)
5503 int d, dim, i, j, m;
5504 ivec tmp, s;
5505 gmx_domdec_zones_t *zones;
5506 gmx_domdec_ns_ranges_t *izone;
5508 for (d = 0; d < dd->ndim; d++)
5510 dim = dd->dim[d];
5511 copy_ivec(dd->ci, tmp);
5512 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5513 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5514 copy_ivec(dd->ci, tmp);
5515 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5516 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5517 if (debug)
5519 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5520 dd->rank, dim,
5521 dd->neighbor[d][0],
5522 dd->neighbor[d][1]);
5526 int nzone = (1 << dd->ndim);
5527 int nizone = (1 << std::max(dd->ndim - 1, 0));
5528 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5530 zones = &dd->comm->zones;
5532 for (i = 0; i < nzone; i++)
5534 m = 0;
5535 clear_ivec(zones->shift[i]);
5536 for (d = 0; d < dd->ndim; d++)
5538 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5542 zones->n = nzone;
5543 for (i = 0; i < nzone; i++)
5545 for (d = 0; d < DIM; d++)
5547 s[d] = dd->ci[d] - zones->shift[i][d];
5548 if (s[d] < 0)
5550 s[d] += dd->nc[d];
5552 else if (s[d] >= dd->nc[d])
5554 s[d] -= dd->nc[d];
5558 zones->nizone = nizone;
5559 for (i = 0; i < zones->nizone; i++)
5561 assert(ddNonbondedZonePairRanges[i][0] == i);
5563 izone = &zones->izone[i];
5564 /* The zone pair ranges are set up for 3D decomposition;
5565 * for fewer dimensions use only j-zones up to nzone.
5567 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5568 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5569 for (dim = 0; dim < DIM; dim++)
5571 if (dd->nc[dim] == 1)
5573 /* All shifts should be allowed */
5574 izone->shift0[dim] = -1;
5575 izone->shift1[dim] = 1;
5577 else
5579 /* Determine the min/max j-zone shift wrt the i-zone */
5580 izone->shift0[dim] = 1;
5581 izone->shift1[dim] = -1;
5582 for (j = izone->j0; j < izone->j1; j++)
5584 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5585 if (shift_diff < izone->shift0[dim])
5587 izone->shift0[dim] = shift_diff;
5589 if (shift_diff > izone->shift1[dim])
5591 izone->shift1[dim] = shift_diff;
5598 if (!isDlbDisabled(dd->comm))
5600 snew(dd->comm->root, dd->ndim);
5603 if (dd->comm->bRecordLoad)
5605 make_load_communicators(dd);
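/*! \brief Set up the communicator for the particle-particle ranks and determine the DD master rank and coordinates */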
5609 static void make_pp_communicator(FILE *fplog,
5610 gmx_domdec_t *dd,
5611 t_commrec gmx_unused *cr,
5612 int gmx_unused reorder)
5614 #if GMX_MPI
5615 gmx_domdec_comm_t *comm;
5616 int rank, *buf;
5617 ivec periods;
5618 MPI_Comm comm_cart;
5620 comm = dd->comm;
5622 if (comm->bCartesianPP)
5624 /* Set up cartesian communication for the particle-particle part */
5625 if (fplog)
5627 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5628 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5631 for (int i = 0; i < DIM; i++)
5633 periods[i] = TRUE;
5635 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5636 &comm_cart);
5637 /* We overwrite the old communicator with the new cartesian one */
5638 cr->mpi_comm_mygroup = comm_cart;
5641 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5642 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5644 if (comm->bCartesianPP_PME)
5646 /* Since we want to use the original Cartesian setup for the simulation,
5647 * and not the one after the split, we need to make an index.
5649 snew(comm->ddindex2ddnodeid, dd->nnodes);
5650 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5651 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5652 /* Get the rank of the DD master,
5653 * above we made sure that the master node is a PP node.
5655 if (MASTER(cr))
5657 rank = dd->rank;
5659 else
5661 rank = 0;
5663 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5665 else if (comm->bCartesianPP)
5667 if (cr->npmenodes == 0)
5669 /* The PP communicator is also
5670 * the communicator for this simulation
5672 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5674 cr->nodeid = dd->rank;
5676 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5678 /* We need to make an index to go from the coordinates
5679 * to the nodeid of this simulation.
5681 snew(comm->ddindex2simnodeid, dd->nnodes);
5682 snew(buf, dd->nnodes);
5683 if (thisRankHasDuty(cr, DUTY_PP))
5685 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5687 /* Communicate the ddindex to simulation nodeid index */
5688 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5689 cr->mpi_comm_mysim);
5690 sfree(buf);
5692 /* Determine the master coordinates and rank.
5693 * The DD master should be the same node as the master of this sim.
5695 for (int i = 0; i < dd->nnodes; i++)
5697 if (comm->ddindex2simnodeid[i] == 0)
5699 ddindex2xyz(dd->nc, i, dd->master_ci);
5700 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5703 if (debug)
5705 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5708 else
5710 /* No Cartesian communicators */
5711 /* We use the rank in dd->comm->all as DD index */
5712 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5713 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5714 dd->masterrank = 0;
5715 clear_ivec(dd->master_ci);
5717 #endif
5719 if (fplog)
5721 fprintf(fplog,
5722 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5723 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5725 if (debug)
5727 fprintf(debug,
5728 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5729 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
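/*! \brief Build the ddindex to simulation node-id mapping on ranks that do not call make_pp_communicator() */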
5733 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5734 t_commrec *cr)
5736 #if GMX_MPI
5737 gmx_domdec_comm_t *comm = dd->comm;
5739 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5741 int *buf;
5742 snew(comm->ddindex2simnodeid, dd->nnodes);
5743 snew(buf, dd->nnodes);
5744 if (thisRankHasDuty(cr, DUTY_PP))
5746 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5748 /* Communicate the ddindex to simulation nodeid index */
5749 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5750 cr->mpi_comm_mysim);
5751 sfree(buf);
5753 #else
5754 GMX_UNUSED_VALUE(dd);
5755 GMX_UNUSED_VALUE(cr);
5756 #endif
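/*! \brief Allocate the struct that holds the global charge-group and state data on the DD master rank */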
5759 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5760 int ncg, int natoms)
5762 gmx_domdec_master_t *ma;
5763 int i;
5765 snew(ma, 1);
5767 snew(ma->ncg, dd->nnodes);
5768 snew(ma->index, dd->nnodes+1);
5769 snew(ma->cg, ncg);
5770 snew(ma->nat, dd->nnodes);
5771 snew(ma->ibuf, dd->nnodes*2);
5772 snew(ma->cell_x, DIM);
5773 for (i = 0; i < DIM; i++)
5775 snew(ma->cell_x[i], dd->nc[i]+1);
5778 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5780 ma->vbuf = nullptr;
5782 else
5784 snew(ma->vbuf, natoms);
5787 return ma;
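/*! \brief Split the simulation communicator into PP and PME-only groups, using a Cartesian layout when possible */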
5790 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5791 DdRankOrder gmx_unused rankOrder,
5792 int gmx_unused reorder)
5794 gmx_domdec_comm_t *comm;
5795 int i;
5796 gmx_bool bDiv[DIM];
5797 #if GMX_MPI
5798 MPI_Comm comm_cart;
5799 #endif
5801 comm = dd->comm;
5803 if (comm->bCartesianPP)
5805 for (i = 1; i < DIM; i++)
5807 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5809 if (bDiv[YY] || bDiv[ZZ])
5811 comm->bCartesianPP_PME = TRUE;
5812 /* If we have 2D PME decomposition, which is always in x+y,
5813 * we stack the PME only nodes in z.
5814 * Otherwise we choose the direction that provides the thinnest slab
5815 * of PME only nodes as this will have the least effect
5816 * on the PP communication.
5817 * But for the PME communication the opposite might be better.
5819 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5820 !bDiv[YY] ||
5821 dd->nc[YY] > dd->nc[ZZ]))
5823 comm->cartpmedim = ZZ;
5825 else
5827 comm->cartpmedim = YY;
5829 comm->ntot[comm->cartpmedim]
5830 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5832 else if (fplog)
5834 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5835 fprintf(fplog,
5836 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5840 if (comm->bCartesianPP_PME)
5842 #if GMX_MPI
5843 int rank;
5844 ivec periods;
5846 if (fplog)
5848 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5851 for (i = 0; i < DIM; i++)
5853 periods[i] = TRUE;
5855 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5856 &comm_cart);
5857 MPI_Comm_rank(comm_cart, &rank);
5858 if (MASTER(cr) && rank != 0)
5860 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5863 /* With this assignment we lose the link to the original communicator,
5864 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5866 cr->mpi_comm_mysim = comm_cart;
5867 cr->sim_nodeid = rank;
5869 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5871 if (fplog)
5873 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5874 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5877 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5879 cr->duty = DUTY_PP;
5881 if (cr->npmenodes == 0 ||
5882 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5884 cr->duty = DUTY_PME;
5887 /* Split the sim communicator into PP and PME only nodes */
5888 MPI_Comm_split(cr->mpi_comm_mysim,
5889 getThisRankDuties(cr),
5890 dd_index(comm->ntot, dd->ci),
5891 &cr->mpi_comm_mygroup);
5892 #endif
5894 else
5896 switch (rankOrder)
5898 case DdRankOrder::pp_pme:
5899 if (fplog)
5901 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5903 break;
5904 case DdRankOrder::interleave:
5905 /* Interleave the PP-only and PME-only ranks */
5906 if (fplog)
5908 fprintf(fplog, "Interleaving PP and PME ranks\n");
5910 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5911 break;
5912 case DdRankOrder::cartesian:
5913 break;
5914 default:
5915 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
5918 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5920 cr->duty = DUTY_PME;
5922 else
5924 cr->duty = DUTY_PP;
5926 #if GMX_MPI
5927 /* Split the sim communicator into PP and PME only nodes */
5928 MPI_Comm_split(cr->mpi_comm_mysim,
5929 getThisRankDuties(cr),
5930 cr->nodeid,
5931 &cr->mpi_comm_mygroup);
5932 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5933 #endif
5936 if (fplog)
5938 fprintf(fplog, "This rank does only %s work.\n\n",
5939 thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
5943 /*! \brief Generates the MPI communicators for domain decomposition */
5944 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5945 gmx_domdec_t *dd, DdRankOrder ddRankOrder)
5947 gmx_domdec_comm_t *comm;
5948 int CartReorder;
5950 comm = dd->comm;
5952 copy_ivec(dd->nc, comm->ntot);
5954 comm->bCartesianPP = (ddRankOrder == DdRankOrder::cartesian);
5955 comm->bCartesianPP_PME = FALSE;
5957 /* Reorder the nodes by default. This might change the MPI ranks.
5958 * Real reordering is only supported on very few architectures;
5959 * Blue Gene is one of them.
5961 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
5963 if (cr->npmenodes > 0)
5965 /* Split the communicator into a PP and PME part */
5966 split_communicator(fplog, cr, dd, ddRankOrder, CartReorder);
5967 if (comm->bCartesianPP_PME)
5969 /* We (possibly) reordered the nodes in split_communicator,
5970 * so it is no longer required in make_pp_communicator.
5972 CartReorder = FALSE;
5975 else
5977 /* All nodes do PP and PME */
5978 #if GMX_MPI
5979 /* We do not require separate communicators */
5980 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5981 #endif
5984 if (thisRankHasDuty(cr, DUTY_PP))
5986 /* Copy or make a new PP communicator */
5987 make_pp_communicator(fplog, dd, cr, CartReorder);
5989 else
5991 receive_ddindex2simnodeid(dd, cr);
5994 if (!thisRankHasDuty(cr, DUTY_PME))
5996 /* Set up the communication to our PME node */
5997 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
5998 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
5999 if (debug)
6001 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6002 dd->pme_nodeid, dd->pme_receive_vir_ener);
6005 else
6007 dd->pme_nodeid = -1;
6010 if (DDMASTER(dd))
6012 dd->ma = init_gmx_domdec_master_t(dd,
6013 comm->cgs_gl.nr,
6014 comm->cgs_gl.index[comm->cgs_gl.nr]);
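/*! \brief Parse the user-supplied static load balancing cell sizes for one dimension and return them as normalized fractions */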
6018 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6020 real *slb_frac, tot;
6021 int i, n;
6022 double dbl;
6024 slb_frac = nullptr;
6025 if (nc > 1 && size_string != nullptr)
6027 if (fplog)
6029 fprintf(fplog, "Using static load balancing for the %s direction\n",
6030 dir);
6032 snew(slb_frac, nc);
6033 tot = 0;
6034 for (i = 0; i < nc; i++)
6036 dbl = 0;
6037 sscanf(size_string, "%20lf%n", &dbl, &n);
6038 if (dbl == 0)
6040 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6042 slb_frac[i] = dbl;
6043 size_string += n;
6044 tot += slb_frac[i];
6046 /* Normalize */
6047 if (fplog)
6049 fprintf(fplog, "Relative cell sizes:");
6051 for (i = 0; i < nc; i++)
6053 slb_frac[i] /= tot;
6054 if (fplog)
6056 fprintf(fplog, " %5.3f", slb_frac[i]);
6059 if (fplog)
6061 fprintf(fplog, "\n");
6065 return slb_frac;
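/*! \brief Return the number of bonded interactions that involve more than two atoms */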
6068 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6070 int n, nmol, ftype;
6071 gmx_mtop_ilistloop_t iloop;
6072 const t_ilist *il;
6074 n = 0;
6075 iloop = gmx_mtop_ilistloop_init(mtop);
6076 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6078 for (ftype = 0; ftype < F_NRE; ftype++)
6080 if ((interaction_function[ftype].flags & IF_BOND) &&
6081 NRAL(ftype) > 2)
6083 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6088 return n;
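/*! \brief Return the integer value of environment variable env_var, or def when it is not set */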
6091 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6093 char *val;
6094 int nst;
6096 nst = def;
6097 val = getenv(env_var);
6098 if (val)
6100 if (sscanf(val, "%20d", &nst) <= 0)
6102 nst = 1;
6104 if (fplog)
6106 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6107 env_var, val, nst);
6111 return nst;
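/*! \brief Print a warning to stderr on the master rank and to the log file when available */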
6114 static void dd_warning(const t_commrec *cr, FILE *fplog, const char *warn_string)
6116 if (MASTER(cr))
6118 fprintf(stderr, "\n%s\n", warn_string);
6120 if (fplog)
6122 fprintf(fplog, "\n%s\n", warn_string);
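/*! \brief Check for input settings that are incompatible with domain decomposition and exit with a fatal error if found */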
6126 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6127 const t_inputrec *ir, FILE *fplog)
6129 if (ir->ePBC == epbcSCREW &&
6130 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6132 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6135 if (ir->ns_type == ensSIMPLE)
6137 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6140 if (ir->nstlist == 0)
6142 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6145 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6147 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
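/*! \brief Return the smallest average cell size over the decomposed dimensions, based on the initial box */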
6151 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6153 int di, d;
6154 real r;
6156 r = ddbox->box_size[XX];
6157 for (di = 0; di < dd->ndim; di++)
6159 d = dd->dim[di];
6160 /* Check using the initial average cell size */
6161 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6164 return r;
6167 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
6169 static int forceDlbOffOrBail(int cmdlineDlbState,
6170 const std::string &reasonStr,
6171 t_commrec *cr,
6172 FILE *fplog)
6174 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
6175 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
6177 if (cmdlineDlbState == edlbsOnUser)
6179 gmx_fatal(FARGS, (dlbNotSupportedErr + reasonStr).c_str());
6181 else if (cmdlineDlbState == edlbsOffCanTurnOn)
6183 dd_warning(cr, fplog, (dlbDisableNote + reasonStr + "\n").c_str());
6185 return edlbsOffForever;
6188 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
6190 * This function parses the parameters of "-dlb" command line option setting
6191 * corresponding state values. Then it checks the consistency of the determined
6192 * state with other run parameters and settings. As a result, the initial state
6193 * may be altered or an error may be thrown if incompatibility of options is detected.
6195 * \param [in] fplog Pointer to mdrun log file.
6196 * \param [in] cr Pointer to MPI communication object.
6197 * \param [in] dlbOption Enum value for the DLB option.
6198 * \param [in] bRecordLoad True if the load balancer is recording load information.
6199 * \param [in] mdrunOptions Options for mdrun.
6200 * \param [in] ir Pointer mdrun to input parameters.
6201 * \returns DLB initial/startup state.
6203 static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
6204 DlbOption dlbOption, gmx_bool bRecordLoad,
6205 const MdrunOptions &mdrunOptions,
6206 const t_inputrec *ir)
6208 int dlbState = edlbsOffCanTurnOn;
6210 switch (dlbOption)
6212 case DlbOption::turnOnWhenUseful: dlbState = edlbsOffCanTurnOn; break;
6213 case DlbOption::no: dlbState = edlbsOffUser; break;
6214 case DlbOption::yes: dlbState = edlbsOnUser; break;
6215 default: gmx_incons("Invalid dlbOption enum value");
6218 /* Reruns don't support DLB: bail or override auto mode */
6219 if (mdrunOptions.rerun)
6221 std::string reasonStr = "it is not supported in reruns.";
6222 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6225 /* Unsupported integrators */
6226 if (!EI_DYNAMICS(ir->eI))
6228 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
6229 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6232 /* Without cycle counters we can't time work to balance on */
6233 if (!bRecordLoad)
6235 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
6236 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6239 if (mdrunOptions.reproducible)
6241 std::string reasonStr = "you started a reproducible run.";
6242 switch (dlbState)
6244 case edlbsOffUser:
6245 break;
6246 case edlbsOffForever:
6247 GMX_RELEASE_ASSERT(false, "edlbsOffForever is not a valid initial state");
6248 break;
6249 case edlbsOffCanTurnOn:
6250 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6251 break;
6252 case edlbsOnCanTurnOff:
6253 GMX_RELEASE_ASSERT(false, "edlbsOnCanTurnOff is not a valid initial state");
6254 break;
6255 case edlbsOnUser:
6256 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", cr, fplog);
6257 break;
6258 default:
6259 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6260 break;
6264 return dlbState;
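/*! \brief Set the order and number of the decomposed dimensions, honoring GMX_DD_ORDER_ZYX */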
6267 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6269 int dim;
6271 dd->ndim = 0;
6272 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6274 /* Decomposition order z,y,x */
6275 if (fplog)
6277 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6279 for (dim = DIM-1; dim >= 0; dim--)
6281 if (dd->nc[dim] > 1)
6283 dd->dim[dd->ndim++] = dim;
6287 else
6289 /* Decomposition order x,y,z */
6290 for (dim = 0; dim < DIM; dim++)
6292 if (dd->nc[dim] > 1)
6294 dd->dim[dd->ndim++] = dim;
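/*! \brief Allocate and initialize the DD communication struct */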
6300 static gmx_domdec_comm_t *init_dd_comm()
6302 gmx_domdec_comm_t *comm;
6303 int i;
6305 snew(comm, 1);
6306 snew(comm->cggl_flag, DIM*2);
6307 snew(comm->cgcm_state, DIM*2);
6308 for (i = 0; i < DIM*2; i++)
6310 comm->cggl_flag_nalloc[i] = 0;
6311 comm->cgcm_state_nalloc[i] = 0;
6314 comm->nalloc_int = 0;
6315 comm->buf_int = nullptr;
6317 vec_rvec_init(&comm->vbuf);
6319 comm->n_load_have = 0;
6320 comm->n_load_collect = 0;
6322 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6324 comm->sum_nat[i] = 0;
6326 comm->ndecomp = 0;
6327 comm->nload = 0;
6328 comm->load_step = 0;
6329 comm->load_sum = 0;
6330 comm->load_max = 0;
6331 clear_ivec(comm->load_lim);
6332 comm->load_mdf = 0;
6333 comm->load_pme = 0;
6335 /* This should be replaced by a unique pointer */
6336 comm->balanceRegion = ddBalanceRegionAllocate();
6338 return comm;
6341 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6342 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6343 const DomdecOptions &options,
6344 const MdrunOptions &mdrunOptions,
6345 const gmx_mtop_t *mtop,
6346 const t_inputrec *ir,
6347 const matrix box, const rvec *xGlobal,
6348 gmx_ddbox_t *ddbox,
6349 int *npme_x, int *npme_y)
6351 real r_bonded = -1;
6352 real r_bonded_limit = -1;
6353 const real tenPercentMargin = 1.1;
6354 gmx_domdec_comm_t *comm = dd->comm;
6356 snew(comm->cggl_flag, DIM*2);
6357 snew(comm->cgcm_state, DIM*2);
6359 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6360 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6362 dd->pme_recv_f_alloc = 0;
6363 dd->pme_recv_f_buf = nullptr;
6365 /* Initialize to GPU share count to 0, might change later */
6366 comm->nrank_gpu_shared = 0;
6368 comm->dlbState = determineInitialDlbState(fplog, cr, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
6369 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6370 /* To consider turning DLB on after 2*nstlist steps we need to check
6371 * at partitioning count 3. Thus we need to increase the first count by 2.
6373 comm->ddPartioningCountFirstDlbOff += 2;
6375 if (fplog)
6377 fprintf(fplog, "Dynamic load balancing: %s\n",
6378 edlbs_names[comm->dlbState]);
6380 comm->bPMELoadBalDLBLimits = FALSE;
6382 /* Allocate the charge group/atom sorting struct */
6383 snew(comm->sort, 1);
6385 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6387 comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
6388 mtop->bIntermolecularInteractions);
6389 if (comm->bInterCGBondeds)
6391 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6393 else
6395 comm->bInterCGMultiBody = FALSE;
6398 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6399 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6401 if (ir->rlist == 0)
6403 /* Set the cut-off to some very large value,
6404 * so we don't need if statements everywhere in the code.
6405 * We use sqrt, since the cut-off is squared in some places.
6407 comm->cutoff = GMX_CUTOFF_INF;
6409 else
6411 comm->cutoff = ir->rlist;
6413 comm->cutoff_mbody = 0;
6415 comm->cellsize_limit = 0;
6416 comm->bBondComm = FALSE;
6418 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6419 * within nstlist steps. Since boundaries are allowed to displace by half
6420 * a cell size, DD cells should be at least the size of the list buffer.
6422 comm->cellsize_limit = std::max(comm->cellsize_limit,
6423 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6425 if (comm->bInterCGBondeds)
6427 if (options.minimumCommunicationRange > 0)
6429 comm->cutoff_mbody = options.minimumCommunicationRange;
6430 if (options.useBondedCommunication)
6432 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6434 else
6436 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6438 r_bonded_limit = comm->cutoff_mbody;
6440 else if (ir->bPeriodicMols)
6442 /* Can not easily determine the required cut-off */
6443 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6444 comm->cutoff_mbody = comm->cutoff/2;
6445 r_bonded_limit = comm->cutoff_mbody;
6447 else
6449 real r_2b, r_mb;
6451 if (MASTER(cr))
6453 dd_bonded_cg_distance(fplog, mtop, ir, xGlobal, box,
6454 options.checkBondedInteractions,
6455 &r_2b, &r_mb);
6457 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6458 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6460 /* We use an initial margin of 10% for the minimum cell size,
6461 * except when we are just below the non-bonded cut-off.
6463 if (options.useBondedCommunication)
6465 if (std::max(r_2b, r_mb) > comm->cutoff)
6467 r_bonded = std::max(r_2b, r_mb);
6468 r_bonded_limit = tenPercentMargin*r_bonded;
6469 comm->bBondComm = TRUE;
6471 else
6473 r_bonded = r_mb;
6474 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6476 /* We determine cutoff_mbody later */
6478 else
6480 /* No special bonded communication,
6481 * simply increase the DD cut-off.
6483 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6484 comm->cutoff_mbody = r_bonded_limit;
6485 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6488 if (fplog)
6490 fprintf(fplog,
6491 "Minimum cell size due to bonded interactions: %.3f nm\n",
6492 r_bonded_limit);
6494 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6497 real rconstr = 0;
6498 if (dd->bInterCGcons && options.constraintCommunicationRange <= 0)
6500 /* There is a cell size limit due to the constraints (P-LINCS) */
6501 rconstr = constr_r_max(fplog, mtop, ir);
6502 if (fplog)
6504 fprintf(fplog,
6505 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6506 rconstr);
6507 if (rconstr > comm->cellsize_limit)
6509 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6513 else if (options.constraintCommunicationRange > 0 && fplog)
6515 /* Here we do not check for dd->bInterCGcons,
6516 * because one can also set a cell size limit for virtual sites only
6517 * and at this point we don't know yet if there are intercg v-sites.
6519 fprintf(fplog,
6520 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6521 options.constraintCommunicationRange);
6522 rconstr = options.constraintCommunicationRange;
6524 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6526 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6528 if (options.numCells[XX] > 0)
6530 copy_ivec(options.numCells, dd->nc);
6531 set_dd_dim(fplog, dd);
6532 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6534 if (options.numPmeRanks >= 0)
6536 cr->npmenodes = options.numPmeRanks;
6538 else
6540 /* When the DD grid is set explicitly and -npme is set to auto,
6541 * don't use PME ranks. We check later if the DD grid is
6542 * compatible with the total number of ranks.
6544 cr->npmenodes = 0;
6547 real acs = average_cellsize_min(dd, ddbox);
6548 if (acs < comm->cellsize_limit)
6550 if (fplog)
6552 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6554 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6555 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6556 acs, comm->cellsize_limit);
6559 else
6561 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6563 /* We need to choose the optimal DD grid and possibly PME nodes */
6564 real limit =
6565 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6566 options.numPmeRanks,
6567 !isDlbDisabled(comm),
6568 options.dlbScaling,
6569 comm->cellsize_limit, comm->cutoff,
6570 comm->bInterCGBondeds);
6572 if (dd->nc[XX] == 0)
6574 char buf[STRLEN];
6575 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6576 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6577 !bC ? "-rdd" : "-rcon",
6578 comm->dlbState != edlbsOffUser ? " or -dds" : "",
6579 bC ? " or your LINCS settings" : "");
6581 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6582 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6583 "%s\n"
6584 "Look in the log file for details on the domain decomposition",
6585 cr->nnodes-cr->npmenodes, limit, buf);
6587 set_dd_dim(fplog, dd);
6590 if (fplog)
6592 fprintf(fplog,
6593 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6594 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6597 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6598 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6600 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6601 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6602 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6604 if (cr->npmenodes > dd->nnodes)
6606 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6607 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6609 if (cr->npmenodes > 0)
6611 comm->npmenodes = cr->npmenodes;
6613 else
6615 comm->npmenodes = dd->nnodes;
6618 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6620 /* The following choices should match those
6621 * in comm_cost_est in domdec_setup.c.
6622 * Note that here the checks have to take into account
6623 * that the decomposition might occur in a different order than xyz
6624 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6625 * in which case they will not match those in comm_cost_est,
6626 * but since that is mainly for testing purposes that's fine.
6628 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6629 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6630 getenv("GMX_PMEONEDD") == nullptr)
6632 comm->npmedecompdim = 2;
6633 comm->npmenodes_x = dd->nc[XX];
6634 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6636 else
6638 /* In case nc is 1 in both x and y we could still choose to
6639 * decompose pme in y instead of x, but we use x for simplicity.
6641 comm->npmedecompdim = 1;
6642 if (dd->dim[0] == YY)
6644 comm->npmenodes_x = 1;
6645 comm->npmenodes_y = comm->npmenodes;
6647 else
6649 comm->npmenodes_x = comm->npmenodes;
6650 comm->npmenodes_y = 1;
6653 if (fplog)
6655 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6656 comm->npmenodes_x, comm->npmenodes_y, 1);
6659 else
6661 comm->npmedecompdim = 0;
6662 comm->npmenodes_x = 0;
6663 comm->npmenodes_y = 0;
6666 /* Technically we don't need both of these,
6667 * but it simplifies the code not to have to recalculate them.
6669 *npme_x = comm->npmenodes_x;
6670 *npme_y = comm->npmenodes_y;
6672 snew(comm->slb_frac, DIM);
6673 if (isDlbDisabled(comm))
6675 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], options.cellSizeX);
6676 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], options.cellSizeY);
6677 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], options.cellSizeZ);
6680 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6682 if (comm->bBondComm || !isDlbDisabled(comm))
6684 /* Set the bonded communication distance to halfway
6685 * the minimum and the maximum,
6686 * since the extra communication cost is nearly zero.
6688 real acs = average_cellsize_min(dd, ddbox);
6689 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6690 if (!isDlbDisabled(comm))
6692 /* Check if this does not limit the scaling */
6693 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
6694 options.dlbScaling*acs);
6696 if (!comm->bBondComm)
6698 /* Without bBondComm do not go beyond the n.b. cut-off */
6699 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6700 if (comm->cellsize_limit >= comm->cutoff)
6702 /* We don't lose a lot of efficiency
6703 * when increasing it to the n.b. cut-off.
6704 * It can even be slightly faster, because we need
6705 * less checks for the communication setup.
6707 comm->cutoff_mbody = comm->cutoff;
6710 /* Check if we did not end up below our original limit */
6711 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6713 if (comm->cutoff_mbody > comm->cellsize_limit)
6715 comm->cellsize_limit = comm->cutoff_mbody;
6718 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6721 if (debug)
6723 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6724 "cellsize limit %f\n",
6725 comm->bBondComm, comm->cellsize_limit);
6728 if (MASTER(cr))
6730 check_dd_restrictions(cr, dd, ir, fplog);
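/*! \brief Copy the DLB pulse counts and minimum cell sizes into the currently active limits */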
6734 static void set_dlb_limits(gmx_domdec_t *dd)
6737 int d;
6739 for (d = 0; d < dd->ndim; d++)
6741 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6742 dd->comm->cellsize_min[dd->dim[d]] =
6743 dd->comm->cellsize_min_dlb[dd->dim[d]];
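/*! \brief Turn on dynamic load balancing, or disable it permanently when the cells are already close to the size limit */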
6748 static void turn_on_dlb(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6750 gmx_domdec_t *dd;
6751 gmx_domdec_comm_t *comm;
6752 real cellsize_min;
6753 int d, nc, i;
6755 dd = cr->dd;
6756 comm = dd->comm;
6758 cellsize_min = comm->cellsize_min[dd->dim[0]];
6759 for (d = 1; d < dd->ndim; d++)
6761 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6764 /* Turn off DLB if we're too close to the cell size limit. */
6765 if (cellsize_min < comm->cellsize_limit*1.05)
6767 auto str = gmx::formatString("step %" GMX_PRId64 " Measured %.1f %% performance loss due to load imbalance, "
6768 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
6769 "Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6770 dd_warning(cr, fplog, str.c_str());
6772 comm->dlbState = edlbsOffForever;
6773 return;
6776 char buf[STRLEN];
6777 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6778 dd_warning(cr, fplog, buf);
6779 comm->dlbState = edlbsOnCanTurnOff;
6781 /* Store the non-DLB performance, so we can check if DLB actually
6782 * improves performance.
6784 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6785 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6787 set_dlb_limits(dd);
6789 /* We can set the required cell size info here,
6790 * so we do not need to communicate this.
6791 * The grid is completely uniform.
6793 for (d = 0; d < dd->ndim; d++)
6795 if (comm->root[d])
6797 comm->load[d].sum_m = comm->load[d].sum;
6799 nc = dd->nc[dd->dim[d]];
6800 for (i = 0; i < nc; i++)
6802 comm->root[d]->cell_f[i] = i/(real)nc;
6803 if (d > 0)
6805 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6806 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6809 comm->root[d]->cell_f[nc] = 1.0;
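/*! \brief Turn off dynamic load balancing, while keeping the option to turn it on again later */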
6814 static void turn_off_dlb(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6816 gmx_domdec_t *dd = cr->dd;
6818 char buf[STRLEN];
6819 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6820 dd_warning(cr, fplog, buf);
6821 dd->comm->dlbState = edlbsOffCanTurnOn;
6822 dd->comm->haveTurnedOffDlb = true;
6823 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
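/*! \brief Permanently disable dynamic load balancing for the rest of the run */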
6826 static void turn_off_dlb_forever(FILE *fplog, const t_commrec *cr, gmx_int64_t step)
6828 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6829 char buf[STRLEN];
6830 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6831 dd_warning(cr, fplog, buf);
6832 cr->dd->comm->dlbState = edlbsOffForever;
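/*! \brief Allocate the per-charge-group locality flags, all initialized to FALSE */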
6835 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6837 int ncg, cg;
6838 char *bLocalCG;
6840 ncg = ncg_mtop(mtop);
6841 snew(bLocalCG, ncg);
6842 for (cg = 0; cg < ncg; cg++)
6844 bLocalCG[cg] = FALSE;
6847 return bLocalCG;
6850 void dd_init_bondeds(FILE *fplog,
6851 gmx_domdec_t *dd,
6852 const gmx_mtop_t *mtop,
6853 const gmx_vsite_t *vsite,
6854 const t_inputrec *ir,
6855 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6857 gmx_domdec_comm_t *comm;
6859 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6861 comm = dd->comm;
6863 if (comm->bBondComm)
6865 /* Communicate atoms beyond the cut-off for bonded interactions */
6866 comm = dd->comm;
6868 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6870 comm->bLocalCG = init_bLocalCG(mtop);
6872 else
6874 /* Only communicate atoms based on cut-off */
6875 comm->cglink = nullptr;
6876 comm->bLocalCG = nullptr;
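/*! \brief Print the DD grid setup, cell sizes and maximum interaction distances to the log file */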
6880 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6881 const gmx_mtop_t *mtop, const t_inputrec *ir,
6882 gmx_bool bDynLoadBal, real dlb_scale,
6883 const gmx_ddbox_t *ddbox)
6885 gmx_domdec_comm_t *comm;
6886 int d;
6887 ivec np;
6888 real limit, shrink;
6889 char buf[64];
6891 if (fplog == nullptr)
6893 return;
6896 comm = dd->comm;
6898 if (bDynLoadBal)
6900 fprintf(fplog, "The maximum number of communication pulses is:");
6901 for (d = 0; d < dd->ndim; d++)
6903 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6905 fprintf(fplog, "\n");
6906 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6907 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6908 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6909 for (d = 0; d < DIM; d++)
6911 if (dd->nc[d] > 1)
6913 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6915 shrink = 0;
6917 else
6919 shrink =
6920 comm->cellsize_min_dlb[d]/
6921 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6923 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6926 fprintf(fplog, "\n");
6928 else
6930 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6931 fprintf(fplog, "The initial number of communication pulses is:");
6932 for (d = 0; d < dd->ndim; d++)
6934 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6936 fprintf(fplog, "\n");
6937 fprintf(fplog, "The initial domain decomposition cell size is:");
6938 for (d = 0; d < DIM; d++)
6940 if (dd->nc[d] > 1)
6942 fprintf(fplog, " %c %.2f nm",
6943 dim2char(d), dd->comm->cellsize_min[d]);
6946 fprintf(fplog, "\n\n");
6949 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6951 if (comm->bInterCGBondeds ||
6952 bInterCGVsites ||
6953 dd->bInterCGcons || dd->bInterCGsettles)
6955 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6956 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6957 "non-bonded interactions", "", comm->cutoff);
6959 if (bDynLoadBal)
6961 limit = dd->comm->cellsize_limit;
6963 else
6965 if (dynamic_dd_box(ddbox, ir))
6967 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6969 limit = dd->comm->cellsize_min[XX];
6970 for (d = 1; d < DIM; d++)
6972 limit = std::min(limit, dd->comm->cellsize_min[d]);
6976 if (comm->bInterCGBondeds)
6978 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6979 "two-body bonded interactions", "(-rdd)",
6980 std::max(comm->cutoff, comm->cutoff_mbody));
6981 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6982 "multi-body bonded interactions", "(-rdd)",
6983 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6985 if (bInterCGVsites)
6987 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6988 "virtual site constructions", "(-rcon)", limit);
6990 if (dd->bInterCGcons || dd->bInterCGsettles)
6992 sprintf(buf, "atoms separated by up to %d constraints",
6993 1+ir->nProjOrder);
6994 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6995 buf, "(-rcon)", limit);
6997 fprintf(fplog, "\n");
7000 fflush(fplog);
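/*! \brief Sets the cell size limits and the number of communication pulses per dimension for use with DLB */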
7003 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7004 real dlb_scale,
7005 const t_inputrec *ir,
7006 const gmx_ddbox_t *ddbox)
7008 gmx_domdec_comm_t *comm;
7009 int d, dim, npulse, npulse_d_max, npulse_d;
7010 gmx_bool bNoCutOff;
7012 comm = dd->comm;
7014 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7016 /* Determine the maximum number of comm. pulses in one dimension */
7018 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7020 /* Determine the maximum required number of grid pulses */
7021 if (comm->cellsize_limit >= comm->cutoff)
7023 /* Only a single pulse is required */
7024 npulse = 1;
7026 else if (!bNoCutOff && comm->cellsize_limit > 0)
7028 /* We round down slightly here to avoid overhead due to the latency
7029 * of extra communication calls when the cut-off
7030 * would be only slightly longer than the cell size.
7031 * Later cellsize_limit is redetermined,
7032 * so we can not miss interactions due to this rounding.
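 * For example, with a cut-off of 1.2 nm and a cell size limit of 0.5 nm this gives npulse = (int)(0.96 + 2.4) = 3.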
7034 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7036 else
7038 /* There is no cell size limit */
7039 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7042 if (!bNoCutOff && npulse > 1)
7044 /* See if we can do with less pulses, based on dlb_scale */
7045 npulse_d_max = 0;
7046 for (d = 0; d < dd->ndim; d++)
7048 dim = dd->dim[d];
7049 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7050 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7051 npulse_d_max = std::max(npulse_d_max, npulse_d);
7053 npulse = std::min(npulse, npulse_d_max);
7056 /* This env var can override npulse */
7057 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7058 if (d > 0)
7060 npulse = d;
7063 comm->maxpulse = 1;
7064 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7065 for (d = 0; d < dd->ndim; d++)
7067 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7068 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7069 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7070 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7071 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7073 comm->bVacDLBNoLimit = FALSE;
7077 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7078 if (!comm->bVacDLBNoLimit)
7080 comm->cellsize_limit = std::max(comm->cellsize_limit,
7081 comm->cutoff/comm->maxpulse);
7083 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7084 /* Set the minimum cell size for each DD dimension */
7085 for (d = 0; d < dd->ndim; d++)
7087 if (comm->bVacDLBNoLimit ||
7088 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7090 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7092 else
7094 comm->cellsize_min_dlb[dd->dim[d]] =
7095 comm->cutoff/comm->cd[d].np_dlb;
7098 if (comm->cutoff_mbody <= 0)
7100 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7102 if (isDlbOn(comm))
7104 set_dlb_limits(dd);
7108 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7110 /* If each molecule is a single charge group
7111 * or we use domain decomposition for each periodic dimension,
7112 * we do not need to take pbc into account for the bonded interactions.
7114 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7115 !(dd->nc[XX] > 1 &&
7116 dd->nc[YY] > 1 &&
7117 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7120 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7121 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7122 const gmx_mtop_t *mtop, const t_inputrec *ir,
7123 const gmx_ddbox_t *ddbox)
7125 gmx_domdec_comm_t *comm;
7126 int natoms_tot;
7127 real vol_frac;
7129 comm = dd->comm;
7131 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7133 init_ddpme(dd, &comm->ddpme[0], 0);
7134 if (comm->npmedecompdim >= 2)
7136 init_ddpme(dd, &comm->ddpme[1], 1);
7139 else
7141 comm->npmenodes = 0;
7142 if (dd->pme_nodeid >= 0)
7144 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7145 "Can not have separate PME ranks without PME electrostatics");
7149 if (debug)
7151 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7153 if (!isDlbDisabled(comm))
7155 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7158 print_dd_settings(fplog, dd, mtop, ir, isDlbOn(comm), dlb_scale, ddbox);
7159 if (comm->dlbState == edlbsOffCanTurnOn)
7161 if (fplog)
7163 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7165 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7168 if (ir->ePBC == epbcNONE)
7170 vol_frac = 1 - 1/(double)dd->nnodes;
7172 else
7174 vol_frac =
7175 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7177 if (debug)
7179 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7181 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7183 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7186 /*! \brief Set some important DD parameters that can be modified by env.vars */
7187 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7189 gmx_domdec_comm_t *comm = dd->comm;
7191 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7192 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7193 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7194 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7195 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7196 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7197 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7199 if (dd->bSendRecv2 && fplog)
7201 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7204 if (comm->eFlop)
7206 if (fplog)
7208 fprintf(fplog, "Will load balance based on FLOP count\n");
7210 if (comm->eFlop > 1)
7212 srand(1 + rank_mysim);
7214 comm->bRecordLoad = TRUE;
7216 else
7218 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7222 DomdecOptions::DomdecOptions() :
7223 checkBondedInteractions(TRUE),
7224 useBondedCommunication(TRUE),
7225 numPmeRanks(-1),
7226 rankOrder(DdRankOrder::pp_pme),
7227 minimumCommunicationRange(0),
7228 constraintCommunicationRange(0),
7229 dlbOption(DlbOption::turnOnWhenUseful),
7230 dlbScaling(0.8),
7231 cellSizeX(nullptr),
7232 cellSizeY(nullptr),
7233 cellSizeZ(nullptr)
7235 clear_ivec(numCells);
7238 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7239 const DomdecOptions &options,
7240 const MdrunOptions &mdrunOptions,
7241 const gmx_mtop_t *mtop,
7242 const t_inputrec *ir,
7243 const matrix box,
7244 const rvec *xGlobal,
7245 gmx_ddbox_t *ddbox,
7246 int *npme_x, int *npme_y)
7248 gmx_domdec_t *dd;
7250 if (fplog)
7252 fprintf(fplog,
7253 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7256 snew(dd, 1);
7258 dd->comm = init_dd_comm();
7260 set_dd_envvar_options(fplog, dd, cr->nodeid);
7262 set_dd_limits_and_grid(fplog, cr, dd, options, mdrunOptions,
7263 mtop, ir,
7264 box, xGlobal,
7265 ddbox,
7266 npme_x, npme_y);
7268 make_dd_communicators(fplog, cr, dd, options.rankOrder);
7270 if (thisRankHasDuty(cr, DUTY_PP))
7272 set_ddgrid_parameters(fplog, dd, options.dlbScaling, mtop, ir, ddbox);
7274 setup_neighbor_relations(dd);
7277 /* Set overallocation to avoid frequent reallocation of arrays */
7278 set_over_alloc_dd(TRUE);
7280 /* Initialize DD partitioning counters */
7281 dd->comm->partition_step = INT_MIN;
7282 dd->ddp_count = 0;
7284 /* We don't know the number of threads yet; this is set later */
7285 dd->comm->nth = 0;
7287 clear_dd_cycle_counts(dd);
7289 return dd;
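/*! \brief Checks whether the DD grid can support the requested cut-off, returns FALSE when it can not */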
7292 static gmx_bool test_dd_cutoff(t_commrec *cr,
7293 t_state *state, const t_inputrec *ir,
7294 real cutoff_req)
7296 gmx_domdec_t *dd;
7297 gmx_ddbox_t ddbox;
7298 int d, dim, np;
7299 real inv_cell_size;
7300 int LocallyLimited;
7302 dd = cr->dd;
7304 set_ddbox(dd, FALSE, cr, ir, state->box,
7305 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7307 LocallyLimited = 0;
7309 for (d = 0; d < dd->ndim; d++)
7311 dim = dd->dim[d];
7313 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7314 if (dynamic_dd_box(&ddbox, ir))
7316 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7319 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7321 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
7323 if (np > dd->comm->cd[d].np_dlb)
7325 return FALSE;
7328 /* If a current local cell size is smaller than the requested
7329 * cut-off, we could still fix it, but this gets very complicated.
7330 * Without fixing here, we might actually need more checks.
7332 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7334 LocallyLimited = 1;
7339 if (!isDlbDisabled(dd->comm))
7341 /* If DLB is not active yet, we don't need to check the grid jumps.
7342 * Actually we shouldn't, because then the grid jump data is not set.
7344 if (isDlbOn(dd->comm) &&
7345 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7347 LocallyLimited = 1;
7350 gmx_sumi(1, &LocallyLimited, cr);
7352 if (LocallyLimited > 0)
7354 return FALSE;
7358 return TRUE;
7361 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7362 real cutoff_req)
7364 gmx_bool bCutoffAllowed;
7366 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7368 if (bCutoffAllowed)
7370 cr->dd->comm->cutoff = cutoff_req;
7373 return bCutoffAllowed;
7376 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7378 gmx_domdec_comm_t *comm;
7380 comm = cr->dd->comm;
7382 /* Turn on the DLB limiting (might have been on already) */
7383 comm->bPMELoadBalDLBLimits = TRUE;
7385 /* Change the cut-off limit */
7386 comm->PMELoadBal_max_cutoff = cutoff;
7388 if (debug)
7390 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7391 comm->PMELoadBal_max_cutoff);
7395 /* Sets whether we should later check the load imbalance data, so that
7396 * we can trigger dynamic load balancing if enough imbalance has
7397 * arisen.
7399 * Used after PME load balancing unlocks DLB, so that the check
7400 * whether DLB will be useful can happen immediately.
7402 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7404 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7406 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7408 if (bValue == TRUE)
7410 /* Store the DD partitioning count, so we can ignore cycle counts
7411 * over the next nstlist steps, which are often slower.
7413 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7418 /* Returns whether we should check if there has been enough load
7419 * imbalance to trigger dynamic load balancing.
7421 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7423 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7425 return FALSE;
7428 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7430 /* We ignore the first nstlist steps at the start of the run
7431 * or after PME load balancing or after turning DLB off, since
7432 * these often have extra allocation or cache miss overhead.
7434 return FALSE;
7437 if (dd->comm->cycl_n[ddCyclStep] == 0)
7439 /* We can have zero timed steps when dd_partition_system is called
7440 * more than once at the same step, e.g. with replica exchange.
7441 * Turning on DLB would trigger an assertion failure later, but is
7442 * also useless right after exchanging replicas.
7444 return FALSE;
7447 /* We should check whether we should use DLB directly after
7448 * unlocking DLB. */
7449 if (dd->comm->bCheckWhetherToTurnDlbOn)
7451 /* This flag was set when the PME load-balancing routines
7452 unlocked DLB, and should now be cleared. */
7453 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7454 return TRUE;
7456 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7457 * partitionings (we do not do this every partitioning, so that we
7458 * avoid excessive communication). */
7459 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7461 return TRUE;
7464 return FALSE;
7467 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7469 return isDlbOn(dd->comm);
7472 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7474 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7477 void dd_dlb_lock(gmx_domdec_t *dd)
7479 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7480 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7482 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7486 void dd_dlb_unlock(gmx_domdec_t *dd)
7488 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7489 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7491 dd->comm->dlbState = edlbsOffCanTurnOn;
7492 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
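/*! \brief Merges the charge groups received in a communication pulse into the zone arrays, shifting the data already stored for later zones */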
7496 static void merge_cg_buffers(int ncell,
7497 gmx_domdec_comm_dim_t *cd, int pulse,
7498 int *ncg_cell,
7499 int *index_gl, int *recv_i,
7500 rvec *cg_cm, rvec *recv_vr,
7501 int *cgindex,
7502 cginfo_mb_t *cginfo_mb, int *cginfo)
7504 gmx_domdec_ind_t *ind, *ind_p;
7505 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7506 int shift, shift_at;
7508 ind = &cd->ind[pulse];
7510 /* First correct the already stored data */
7511 shift = ind->nrecv[ncell];
7512 for (cell = ncell-1; cell >= 0; cell--)
7514 shift -= ind->nrecv[cell];
7515 if (shift > 0)
7517 /* Move the cg's present from previous grid pulses */
7518 cg0 = ncg_cell[ncell+cell];
7519 cg1 = ncg_cell[ncell+cell+1];
7520 cgindex[cg1+shift] = cgindex[cg1];
7521 for (cg = cg1-1; cg >= cg0; cg--)
7523 index_gl[cg+shift] = index_gl[cg];
7524 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7525 cgindex[cg+shift] = cgindex[cg];
7526 cginfo[cg+shift] = cginfo[cg];
7528 /* Correct the already stored send indices for the shift */
7529 for (p = 1; p <= pulse; p++)
7531 ind_p = &cd->ind[p];
7532 cg0 = 0;
7533 for (c = 0; c < cell; c++)
7535 cg0 += ind_p->nsend[c];
7537 cg1 = cg0 + ind_p->nsend[cell];
7538 for (cg = cg0; cg < cg1; cg++)
7540 ind_p->index[cg] += shift;
7546 /* Merge in the communicated buffers */
7547 shift = 0;
7548 shift_at = 0;
7549 cg0 = 0;
7550 for (cell = 0; cell < ncell; cell++)
7552 cg1 = ncg_cell[ncell+cell+1] + shift;
7553 if (shift_at > 0)
7555 /* Correct the old cg indices */
7556 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7558 cgindex[cg+1] += shift_at;
7561 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7563 /* Copy this charge group from the buffer */
7564 index_gl[cg1] = recv_i[cg0];
7565 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7566 /* Add it to the cgindex */
7567 cg_gl = index_gl[cg1];
7568 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7569 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7570 cgindex[cg1+1] = cgindex[cg1] + nat;
7571 cg0++;
7572 cg1++;
7573 shift_at += nat;
7575 shift += ind->nrecv[cell];
7576 ncg_cell[ncell+cell+1] = cg1;
7580 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7581 int nzone, int cg0, const int *cgindex)
7583 int cg, zone, p;
7585 /* Store the atom block boundaries for easy copying of communication buffers
7587 cg = cg0;
7588 for (zone = 0; zone < nzone; zone++)
7590 for (p = 0; p < cd->np; p++)
7592 cd->ind[p].cell2at0[zone] = cgindex[cg];
7593 cg += cd->ind[p].nrecv[zone];
7594 cd->ind[p].cell2at1[zone] = cgindex[cg];
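/*! \brief Returns TRUE when charge group cg_gl is linked to a charge group that is not marked as local in bLocalCG */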
7599 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7601 int i;
7602 gmx_bool bMiss;
7604 bMiss = FALSE;
7605 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7607 if (!bLocalCG[link->a[i]])
7609 bMiss = TRUE;
7613 return bMiss;
7616 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7617 typedef struct {
7618 real c[DIM][4]; /* the corners for the non-bonded communication */
7619 real cr0; /* corner for rounding */
7620 real cr1[4]; /* corners for rounding */
7621 real bc[DIM]; /* corners for bounded communication */
7622 real bcr1; /* corner for rounding for bonded communication */
7623 } dd_corners_t;
7625 /* Determine the corners of the domain(s) we are communicating with */
7626 static void
7627 set_dd_corners(const gmx_domdec_t *dd,
7628 int dim0, int dim1, int dim2,
7629 gmx_bool bDistMB,
7630 dd_corners_t *c)
7632 const gmx_domdec_comm_t *comm;
7633 const gmx_domdec_zones_t *zones;
7634 int i, j;
7636 comm = dd->comm;
7638 zones = &comm->zones;
7640 /* Keep the compiler happy */
7641 c->cr0 = 0;
7642 c->bcr1 = 0;
7644 /* The first dimension is equal for all cells */
7645 c->c[0][0] = comm->cell_x0[dim0];
7646 if (bDistMB)
7648 c->bc[0] = c->c[0][0];
7650 if (dd->ndim >= 2)
7652 dim1 = dd->dim[1];
7653 /* This cell row is only seen from the first row */
7654 c->c[1][0] = comm->cell_x0[dim1];
7655 /* All rows can see this row */
7656 c->c[1][1] = comm->cell_x0[dim1];
7657 if (isDlbOn(dd->comm))
7659 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7660 if (bDistMB)
7662 /* For the multi-body distance we need the maximum */
7663 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7666 /* Set the upper-right corner for rounding */
7667 c->cr0 = comm->cell_x1[dim0];
7669 if (dd->ndim >= 3)
7671 dim2 = dd->dim[2];
7672 for (j = 0; j < 4; j++)
7674 c->c[2][j] = comm->cell_x0[dim2];
7676 if (isDlbOn(dd->comm))
7678 /* Use the maximum of the i-cells that see a j-cell */
7679 for (i = 0; i < zones->nizone; i++)
7681 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7683 if (j >= 4)
7685 c->c[2][j-4] =
7686 std::max(c->c[2][j-4],
7687 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7691 if (bDistMB)
7693 /* For the multi-body distance we need the maximum */
7694 c->bc[2] = comm->cell_x0[dim2];
7695 for (i = 0; i < 2; i++)
7697 for (j = 0; j < 2; j++)
7699 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7705 /* Set the upper-right corner for rounding */
7706 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7707 * Only cell (0,0,0) can see cell 7 (1,1,1)
7709 c->cr1[0] = comm->cell_x1[dim1];
7710 c->cr1[3] = comm->cell_x1[dim1];
7711 if (isDlbOn(dd->comm))
7713 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7714 if (bDistMB)
7716 /* For the multi-body distance we need the maximum */
7717 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7724 /* Determine which cg's we need to send in this pulse from this zone */
7725 static void
7726 get_zone_pulse_cgs(gmx_domdec_t *dd,
7727 int zonei, int zone,
7728 int cg0, int cg1,
7729 const int *index_gl,
7730 const int *cgindex,
7731 int dim, int dim_ind,
7732 int dim0, int dim1, int dim2,
7733 real r_comm2, real r_bcomm2,
7734 matrix box,
7735 ivec tric_dist,
7736 rvec *normal,
7737 real skew_fac2_d, real skew_fac_01,
7738 rvec *v_d, rvec *v_0, rvec *v_1,
7739 const dd_corners_t *c,
7740 rvec sf2_round,
7741 gmx_bool bDistBonded,
7742 gmx_bool bBondComm,
7743 gmx_bool bDist2B,
7744 gmx_bool bDistMB,
7745 rvec *cg_cm,
7746 int *cginfo,
7747 gmx_domdec_ind_t *ind,
7748 int **ibuf, int *ibuf_nalloc,
7749 vec_rvec_t *vbuf,
7750 int *nsend_ptr,
7751 int *nat_ptr,
7752 int *nsend_z_ptr)
7754 gmx_domdec_comm_t *comm;
7755 gmx_bool bScrew;
7756 gmx_bool bDistMB_pulse;
7757 int cg, i;
7758 real r2, rb2, r, tric_sh;
7759 rvec rn, rb;
7760 int dimd;
7761 int nsend_z, nsend, nat;
7763 comm = dd->comm;
7765 bScrew = (dd->bScrewPBC && dim == XX);
7767 bDistMB_pulse = (bDistMB && bDistBonded);
7769 nsend_z = 0;
7770 nsend = *nsend_ptr;
7771 nat = *nat_ptr;
7773 for (cg = cg0; cg < cg1; cg++)
7775 r2 = 0;
7776 rb2 = 0;
7777 if (tric_dist[dim_ind] == 0)
7779 /* Rectangular direction, easy */
7780 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7781 if (r > 0)
7783 r2 += r*r;
7785 if (bDistMB_pulse)
7787 r = cg_cm[cg][dim] - c->bc[dim_ind];
7788 if (r > 0)
7790 rb2 += r*r;
7793 /* Rounding gives at most a 16% reduction
7794 * in communicated atoms
7796 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7798 r = cg_cm[cg][dim0] - c->cr0;
7799 /* This is the first dimension, so always r >= 0 */
7800 r2 += r*r;
7801 if (bDistMB_pulse)
7803 rb2 += r*r;
7806 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7808 r = cg_cm[cg][dim1] - c->cr1[zone];
7809 if (r > 0)
7811 r2 += r*r;
7813 if (bDistMB_pulse)
7815 r = cg_cm[cg][dim1] - c->bcr1;
7816 if (r > 0)
7818 rb2 += r*r;
7823 else
7825 /* Triclinic direction, more complicated */
7826 clear_rvec(rn);
7827 clear_rvec(rb);
7828 /* Rounding, conservative as the skew_fac multiplication
7829 * will slightly underestimate the distance.
7831 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7833 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7834 for (i = dim0+1; i < DIM; i++)
7836 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7838 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7839 if (bDistMB_pulse)
7841 rb[dim0] = rn[dim0];
7842 rb2 = r2;
7844 /* Take care that the cell planes along dim0 might not
7845 * be orthogonal to those along dim1 and dim2.
7847 for (i = 1; i <= dim_ind; i++)
7849 dimd = dd->dim[i];
7850 if (normal[dim0][dimd] > 0)
7852 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7853 if (bDistMB_pulse)
7855 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7860 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7862 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7863 tric_sh = 0;
7864 for (i = dim1+1; i < DIM; i++)
7866 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7868 rn[dim1] += tric_sh;
7869 if (rn[dim1] > 0)
7871 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7872 /* Take care of coupling of the distances
7873 * to the planes along dim0 and dim1 through dim2.
7875 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7876 /* Take care that the cell planes along dim1
7877 * might not be orthogonal to that along dim2.
7879 if (normal[dim1][dim2] > 0)
7881 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7884 if (bDistMB_pulse)
7886 rb[dim1] +=
7887 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7888 if (rb[dim1] > 0)
7890 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7891 /* Take care of coupling of the distances
7892 * to the planes along dim0 and dim1 through dim2.
7894 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7895 /* Take care that the cell planes along dim1
7896 * might not be orthogonal to that along dim2.
7898 if (normal[dim1][dim2] > 0)
7900 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7905 /* The distance along the communication direction */
7906 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7907 tric_sh = 0;
7908 for (i = dim+1; i < DIM; i++)
7910 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7912 rn[dim] += tric_sh;
7913 if (rn[dim] > 0)
7915 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7916 /* Take care of coupling of the distances
7917 * to the planes along dim0 and dim1 through dim2.
7919 if (dim_ind == 1 && zonei == 1)
7921 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7924 if (bDistMB_pulse)
7926 clear_rvec(rb);
7927 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7928 if (rb[dim] > 0)
7930 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7931 /* Take care of coupling of the distances
7932 * to the planes along dim0 and dim1 through dim2.
7934 if (dim_ind == 1 && zonei == 1)
7936 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7942 if (r2 < r_comm2 ||
7943 (bDistBonded &&
7944 ((bDistMB && rb2 < r_bcomm2) ||
7945 (bDist2B && r2 < r_bcomm2)) &&
7946 (!bBondComm ||
7947 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7948 missing_link(comm->cglink, index_gl[cg],
7949 comm->bLocalCG)))))
7951 /* Make an index to the local charge groups */
7952 if (nsend+1 > ind->nalloc)
7954 ind->nalloc = over_alloc_large(nsend+1);
7955 srenew(ind->index, ind->nalloc);
7957 if (nsend+1 > *ibuf_nalloc)
7959 *ibuf_nalloc = over_alloc_large(nsend+1);
7960 srenew(*ibuf, *ibuf_nalloc);
7962 ind->index[nsend] = cg;
7963 (*ibuf)[nsend] = index_gl[cg];
7964 nsend_z++;
7965 vec_rvec_check_alloc(vbuf, nsend+1);
7967 if (dd->ci[dim] == 0)
7969 /* Correct cg_cm for pbc */
7970 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7971 if (bScrew)
7973 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7974 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7977 else
7979 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7981 nsend++;
7982 nat += cgindex[cg+1] - cgindex[cg];
7986 *nsend_ptr = nsend;
7987 *nat_ptr = nat;
7988 *nsend_z_ptr = nsend_z;
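/*! \brief Sets up the halo communication: selects the charge groups to send and communicates their indices and positions for all DD dimensions and pulses */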
7991 static void setup_dd_communication(gmx_domdec_t *dd,
7992 matrix box, gmx_ddbox_t *ddbox,
7993 t_forcerec *fr,
7994 t_state *state, PaddedRVecVector *f)
7996 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7997 int nzone, nzone_send, zone, zonei, cg0, cg1;
7998 int c, i, cg, cg_gl, nrcg;
7999 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8000 gmx_domdec_comm_t *comm;
8001 gmx_domdec_zones_t *zones;
8002 gmx_domdec_comm_dim_t *cd;
8003 gmx_domdec_ind_t *ind;
8004 cginfo_mb_t *cginfo_mb;
8005 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
8006 real r_comm2, r_bcomm2;
8007 dd_corners_t corners;
8008 ivec tric_dist;
8009 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
8010 real skew_fac2_d, skew_fac_01;
8011 rvec sf2_round;
8012 int nsend, nat;
8013 int th;
8015 if (debug)
8017 fprintf(debug, "Setting up DD communication\n");
8020 comm = dd->comm;
8022 if (comm->nth == 0)
8024 /* Initialize the thread data.
8025 * This can not be done in init_domain_decomposition,
8026 * as the number of threads is determined later.
8028 comm->nth = gmx_omp_nthreads_get(emntDomdec);
8029 if (comm->nth > 1)
8031 snew(comm->dth, comm->nth);
8035 switch (fr->cutoff_scheme)
8037 case ecutsGROUP:
8038 cg_cm = fr->cg_cm;
8039 break;
8040 case ecutsVERLET:
8041 cg_cm = as_rvec_array(state->x.data());
8042 break;
8043 default:
8044 gmx_incons("unimplemented");
8045 cg_cm = nullptr;
8048 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8050 /* Check if we need to use triclinic distances */
8051 tric_dist[dim_ind] = 0;
8052 for (i = 0; i <= dim_ind; i++)
8054 if (ddbox->tric_dir[dd->dim[i]])
8056 tric_dist[dim_ind] = 1;
8061 bBondComm = comm->bBondComm;
8063 /* Do we need to determine extra distances for multi-body bondeds? */
8064 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8066 /* Do we need to determine extra distances for only two-body bondeds? */
8067 bDist2B = (bBondComm && !bDistMB);
8069 r_comm2 = gmx::square(comm->cutoff);
8070 r_bcomm2 = gmx::square(comm->cutoff_mbody);
8072 if (debug)
8074 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8077 zones = &comm->zones;
8079 dim0 = dd->dim[0];
8080 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8081 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8083 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8085 /* Triclinic stuff */
8086 normal = ddbox->normal;
8087 skew_fac_01 = 0;
8088 if (dd->ndim >= 2)
8090 v_0 = ddbox->v[dim0];
8091 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8093 /* Determine the coupling coefficient for the distances
8094 * to the cell planes along dim0 and dim1 through dim2.
8095 * This is required for correct rounding.
8097 skew_fac_01 =
8098 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8099 if (debug)
8101 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8105 if (dd->ndim >= 3)
8107 v_1 = ddbox->v[dim1];
8110 zone_cg_range = zones->cg_range;
8111 index_gl = dd->index_gl;
8112 cgindex = dd->cgindex;
8113 cginfo_mb = fr->cginfo_mb;
8115 zone_cg_range[0] = 0;
8116 zone_cg_range[1] = dd->ncg_home;
8117 comm->zone_ncg1[0] = dd->ncg_home;
8118 pos_cg = dd->ncg_home;
8120 nat_tot = dd->nat_home;
8121 nzone = 1;
8122 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8124 dim = dd->dim[dim_ind];
8125 cd = &comm->cd[dim_ind];
8127 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8129 /* No pbc in this dimension, the first node should not comm. */
8130 nzone_send = 0;
8132 else
8134 nzone_send = nzone;
8137 v_d = ddbox->v[dim];
8138 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8140 cd->bInPlace = TRUE;
8141 for (p = 0; p < cd->np; p++)
8143 /* Only atoms communicated in the first pulse are used
8144 * for multi-body bonded interactions or for bBondComm.
8146 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8148 ind = &cd->ind[p];
8149 nsend = 0;
8150 nat = 0;
8151 for (zone = 0; zone < nzone_send; zone++)
8153 if (tric_dist[dim_ind] && dim_ind > 0)
8155 /* Determine slightly more optimized skew_fac's
8156 * for rounding.
8157 * This reduces the number of communicated atoms
8158 * by about 10% for 3D DD of rhombic dodecahedra.
8160 for (dimd = 0; dimd < dim; dimd++)
8162 sf2_round[dimd] = 1;
8163 if (ddbox->tric_dir[dimd])
8165 for (i = dd->dim[dimd]+1; i < DIM; i++)
8167 /* If we are shifted in dimension i
8168 * and the cell plane is tilted forward
8169 * in dimension i, skip this coupling.
8171 if (!(zones->shift[nzone+zone][i] &&
8172 ddbox->v[dimd][i][dimd] >= 0))
8174 sf2_round[dimd] +=
8175 gmx::square(ddbox->v[dimd][i][dimd]);
8178 sf2_round[dimd] = 1/sf2_round[dimd];
8183 zonei = zone_perm[dim_ind][zone];
8184 if (p == 0)
8186 /* Here we permute the zones to obtain a convenient order
8187 * for neighbor searching
8189 cg0 = zone_cg_range[zonei];
8190 cg1 = zone_cg_range[zonei+1];
8192 else
8194 /* Look only at the cg's received in the previous grid pulse
8196 cg1 = zone_cg_range[nzone+zone+1];
8197 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8200 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8201 for (th = 0; th < comm->nth; th++)
8205 gmx_domdec_ind_t *ind_p;
8206 int **ibuf_p, *ibuf_nalloc_p;
8207 vec_rvec_t *vbuf_p;
8208 int *nsend_p, *nat_p;
8209 int *nsend_zone_p;
8210 int cg0_th, cg1_th;
8212 if (th == 0)
8214 /* Thread 0 writes in the comm buffers */
8215 ind_p = ind;
8216 ibuf_p = &comm->buf_int;
8217 ibuf_nalloc_p = &comm->nalloc_int;
8218 vbuf_p = &comm->vbuf;
8219 nsend_p = &nsend;
8220 nat_p = &nat;
8221 nsend_zone_p = &ind->nsend[zone];
8223 else
8225 /* Other threads write into temp buffers */
8226 ind_p = &comm->dth[th].ind;
8227 ibuf_p = &comm->dth[th].ibuf;
8228 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8229 vbuf_p = &comm->dth[th].vbuf;
8230 nsend_p = &comm->dth[th].nsend;
8231 nat_p = &comm->dth[th].nat;
8232 nsend_zone_p = &comm->dth[th].nsend_zone;
8234 comm->dth[th].nsend = 0;
8235 comm->dth[th].nat = 0;
8236 comm->dth[th].nsend_zone = 0;
8239 if (comm->nth == 1)
8241 cg0_th = cg0;
8242 cg1_th = cg1;
8244 else
8246 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8247 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8250 /* Get the cg's for this pulse in this zone */
8251 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8252 index_gl, cgindex,
8253 dim, dim_ind, dim0, dim1, dim2,
8254 r_comm2, r_bcomm2,
8255 box, tric_dist,
8256 normal, skew_fac2_d, skew_fac_01,
8257 v_d, v_0, v_1, &corners, sf2_round,
8258 bDistBonded, bBondComm,
8259 bDist2B, bDistMB,
8260 cg_cm, fr->cginfo,
8261 ind_p,
8262 ibuf_p, ibuf_nalloc_p,
8263 vbuf_p,
8264 nsend_p, nat_p,
8265 nsend_zone_p);
8267 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8268 } // END
8270 /* Append data of threads>=1 to the communication buffers */
8271 for (th = 1; th < comm->nth; th++)
8273 dd_comm_setup_work_t *dth;
8274 int i, ns1;
8276 dth = &comm->dth[th];
8278 ns1 = nsend + dth->nsend_zone;
8279 if (ns1 > ind->nalloc)
8281 ind->nalloc = over_alloc_dd(ns1);
8282 srenew(ind->index, ind->nalloc);
8284 if (ns1 > comm->nalloc_int)
8286 comm->nalloc_int = over_alloc_dd(ns1);
8287 srenew(comm->buf_int, comm->nalloc_int);
8289 if (ns1 > comm->vbuf.nalloc)
8291 comm->vbuf.nalloc = over_alloc_dd(ns1);
8292 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8295 for (i = 0; i < dth->nsend_zone; i++)
8297 ind->index[nsend] = dth->ind.index[i];
8298 comm->buf_int[nsend] = dth->ibuf[i];
8299 copy_rvec(dth->vbuf.v[i],
8300 comm->vbuf.v[nsend]);
8301 nsend++;
8303 nat += dth->nat;
8304 ind->nsend[zone] += dth->nsend_zone;
8307 /* Clear the counts in case we do not have pbc */
8308 for (zone = nzone_send; zone < nzone; zone++)
8310 ind->nsend[zone] = 0;
8312 ind->nsend[nzone] = nsend;
8313 ind->nsend[nzone+1] = nat;
8314 /* Communicate the number of cg's and atoms to receive */
8315 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8316 ind->nsend, nzone+2,
8317 ind->nrecv, nzone+2);
8319 /* The rvec buffer is also required for atom buffers of size nsend
8320 * in dd_move_x and dd_move_f.
8322 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8324 if (p > 0)
8326 /* We can receive in place if only the last zone is not empty */
8327 for (zone = 0; zone < nzone-1; zone++)
8329 if (ind->nrecv[zone] > 0)
8331 cd->bInPlace = FALSE;
8334 if (!cd->bInPlace)
8336 /* The int buffer is only required here for the cg indices */
8337 if (ind->nrecv[nzone] > comm->nalloc_int2)
8339 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8340 srenew(comm->buf_int2, comm->nalloc_int2);
8342 /* The rvec buffer is also required for atom buffers
8343 * of size nrecv in dd_move_x and dd_move_f.
8345 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8346 vec_rvec_check_alloc(&comm->vbuf2, i);
8350 /* Make space for the global cg indices */
8351 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8352 || dd->cg_nalloc == 0)
8354 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8355 srenew(index_gl, dd->cg_nalloc);
8356 srenew(cgindex, dd->cg_nalloc+1);
8358 /* Communicate the global cg indices */
8359 if (cd->bInPlace)
8361 recv_i = index_gl + pos_cg;
8363 else
8365 recv_i = comm->buf_int2;
8367 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8368 comm->buf_int, nsend,
8369 recv_i, ind->nrecv[nzone]);
8371 /* Make space for cg_cm */
8372 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8373 if (fr->cutoff_scheme == ecutsGROUP)
8375 cg_cm = fr->cg_cm;
8377 else
8379 cg_cm = as_rvec_array(state->x.data());
8381 /* Communicate cg_cm */
8382 if (cd->bInPlace)
8384 recv_vr = cg_cm + pos_cg;
8386 else
8388 recv_vr = comm->vbuf2.v;
8390 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8391 comm->vbuf.v, nsend,
8392 recv_vr, ind->nrecv[nzone]);
8394 /* Make the charge group index */
8395 if (cd->bInPlace)
8397 zone = (p == 0 ? 0 : nzone - 1);
8398 while (zone < nzone)
8400 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8402 cg_gl = index_gl[pos_cg];
8403 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8404 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8405 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8406 if (bBondComm)
8408 /* Update the charge group presence,
8409 * so we can use it in the next pass of the loop.
8411 comm->bLocalCG[cg_gl] = TRUE;
8413 pos_cg++;
8415 if (p == 0)
8417 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8419 zone++;
8420 zone_cg_range[nzone+zone] = pos_cg;
8423 else
8425 /* This part of the code is never executed with bBondComm. */
8426 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8427 index_gl, recv_i, cg_cm, recv_vr,
8428 cgindex, fr->cginfo_mb, fr->cginfo);
8429 pos_cg += ind->nrecv[nzone];
8431 nat_tot += ind->nrecv[nzone+1];
8433 if (!cd->bInPlace)
8435 /* Store the atom block for easy copying of communication buffers */
8436 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8438 nzone += nzone;
8440 dd->index_gl = index_gl;
8441 dd->cgindex = cgindex;
8443 dd->ncg_tot = zone_cg_range[zones->n];
8444 dd->nat_tot = nat_tot;
8445 comm->nat[ddnatHOME] = dd->nat_home;
8446 for (i = ddnatZONE; i < ddnatNR; i++)
8448 comm->nat[i] = dd->nat_tot;
8451 if (!bBondComm)
8453 /* We don't need to update cginfo, since that was already done above.
8454 * So we pass NULL for the forcerec.
8456 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8457 nullptr, comm->bLocalCG);
8460 if (debug)
8462 fprintf(debug, "Finished setting up DD communication, zones:");
8463 for (c = 0; c < zones->n; c++)
8465 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8467 fprintf(debug, "\n");
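/*! \brief Sets the charge group range boundaries of the i-zones and of the j-zone ranges they interact with */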
8471 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8473 int c;
8475 for (c = 0; c < zones->nizone; c++)
8477 zones->izone[c].cg1 = zones->cg_range[c+1];
8478 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8479 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8483 /* \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
8485 * Also sets the atom density for the home zone when \p zone_start=0.
8486 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
8487 * how many charge groups will move but are still part of the current range.
8488 * \todo When converting domdec to use proper classes, all these variables
8489 * should be private and a method should return the correct count
8490 * depending on an internal state.
8492 * \param[in,out] dd The domain decomposition struct
8493 * \param[in] box The box
8494 * \param[in] ddbox The domain decomposition box struct
8495 * \param[in] zone_start The start of the zone range to set sizes for
8496 * \param[in] zone_end The end of the zone range to set sizes for
8497 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should be moved but are still present in dd->comm->zones.cg_range
8499 static void set_zones_size(gmx_domdec_t *dd,
8500 matrix box, const gmx_ddbox_t *ddbox,
8501 int zone_start, int zone_end,
8502 int numMovedChargeGroupsInHomeZone)
8504 gmx_domdec_comm_t *comm;
8505 gmx_domdec_zones_t *zones;
8506 gmx_bool bDistMB;
8507 int z, zi, d, dim;
8508 real rcs, rcmbs;
8509 int i, j;
8510 real vol;
8512 comm = dd->comm;
8514 zones = &comm->zones;
8516 /* Do we need to determine extra distances for multi-body bondeds? */
8517 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8519 for (z = zone_start; z < zone_end; z++)
8521 /* Copy cell limits to zone limits.
8522 * Valid for non-DD dims and non-shifted dims.
8524 copy_rvec(comm->cell_x0, zones->size[z].x0);
8525 copy_rvec(comm->cell_x1, zones->size[z].x1);
8528 for (d = 0; d < dd->ndim; d++)
8530 dim = dd->dim[d];
8532 for (z = 0; z < zones->n; z++)
8534 /* With a staggered grid we have different sizes
8535 * for non-shifted dimensions.
8537 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
8539 if (d == 1)
8541 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8542 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8544 else if (d == 2)
8546 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8547 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8552 rcs = comm->cutoff;
8553 rcmbs = comm->cutoff_mbody;
8554 if (ddbox->tric_dir[dim])
8556 rcs /= ddbox->skew_fac[dim];
8557 rcmbs /= ddbox->skew_fac[dim];
8560 /* Set the lower limit for the shifted zone dimensions */
8561 for (z = zone_start; z < zone_end; z++)
8563 if (zones->shift[z][dim] > 0)
8565 dim = dd->dim[d];
8566 if (!isDlbOn(dd->comm) || d == 0)
8568 zones->size[z].x0[dim] = comm->cell_x1[dim];
8569 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8571 else
8573 /* Here we take the lower limit of the zone from
8574 * the lowest domain of the zone below.
8576 if (z < 4)
8578 zones->size[z].x0[dim] =
8579 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8581 else
8583 if (d == 1)
8585 zones->size[z].x0[dim] =
8586 zones->size[zone_perm[2][z-4]].x0[dim];
8588 else
8590 zones->size[z].x0[dim] =
8591 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8594 /* A temporary limit, is updated below */
8595 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8597 if (bDistMB)
8599 for (zi = 0; zi < zones->nizone; zi++)
8601 if (zones->shift[zi][dim] == 0)
8603 /* This takes the whole zone into account.
8604 * With multiple pulses this will lead
8605 * to a larger zone than strictly necessary.
8607 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8608 zones->size[zi].x1[dim]+rcmbs);
8616 /* Loop over the i-zones to set the upper limit of each
8617 * j-zone they see.
8619 for (zi = 0; zi < zones->nizone; zi++)
8621 if (zones->shift[zi][dim] == 0)
8623 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8625 if (zones->shift[z][dim] > 0)
8627 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8628 zones->size[zi].x1[dim]+rcs);
8635 for (z = zone_start; z < zone_end; z++)
8637 /* Initialization only required to keep the compiler happy */
8638 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8639 int nc, c;
8641 /* To determine the bounding box for a zone we need to find
8642 * the extreme corners of 4, 2 or 1 corners.
8644 nc = 1 << (ddbox->nboundeddim - 1);
8646 for (c = 0; c < nc; c++)
8648 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8649 corner[XX] = 0;
8650 if ((c & 1) == 0)
8652 corner[YY] = zones->size[z].x0[YY];
8654 else
8656 corner[YY] = zones->size[z].x1[YY];
8658 if ((c & 2) == 0)
8660 corner[ZZ] = zones->size[z].x0[ZZ];
8662 else
8664 corner[ZZ] = zones->size[z].x1[ZZ];
8666 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8667 box[ZZ][1 - dd->dim[0]] != 0)
8669 /* With 1D domain decomposition the cg's are not in
8670 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8671 * Shift the corner of the z-vector back to along the box
8672 * vector of dimension d, so it will later end up at 0 along d.
8673 * This can affect the location of this corner along dd->dim[0]
8674 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8676 int d = 1 - dd->dim[0];
8678 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8680 /* Apply the triclinic couplings */
8681 assert(ddbox->npbcdim <= DIM);
8682 for (i = YY; i < ddbox->npbcdim; i++)
8684 for (j = XX; j < i; j++)
8686 corner[j] += corner[i]*box[i][j]/box[i][i];
8689 if (c == 0)
8691 copy_rvec(corner, corner_min);
8692 copy_rvec(corner, corner_max);
8694 else
8696 for (i = 0; i < DIM; i++)
8698 corner_min[i] = std::min(corner_min[i], corner[i]);
8699 corner_max[i] = std::max(corner_max[i], corner[i]);
8703 /* Copy the extreme corners without offset along x */
8704 for (i = 0; i < DIM; i++)
8706 zones->size[z].bb_x0[i] = corner_min[i];
8707 zones->size[z].bb_x1[i] = corner_max[i];
8709 /* Add the offset along x */
8710 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8711 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8714 if (zone_start == 0)
8716 vol = 1;
8717 for (dim = 0; dim < DIM; dim++)
8719 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8721 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
8724 if (debug)
8726 for (z = zone_start; z < zone_end; z++)
8728 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8730 zones->size[z].x0[XX], zones->size[z].x1[XX],
8731 zones->size[z].x0[YY], zones->size[z].x1[YY],
8732 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8733 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8735 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8736 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8737 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
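/*! \brief qsort comparison function: orders charge groups on ns grid cell index, then on global topology index */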
8742 static int comp_cgsort(const void *a, const void *b)
8744 int comp;
8746 gmx_cgsort_t *cga, *cgb;
8747 cga = (gmx_cgsort_t *)a;
8748 cgb = (gmx_cgsort_t *)b;
8750 comp = cga->nsc - cgb->nsc;
8751 if (comp == 0)
8753 comp = cga->ind_gl - cgb->ind_gl;
8756 return comp;
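/*! \brief Reorders, in place, an int array with one entry per charge group according to the sort order, using buf as scratch space */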
8759 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8760 int *a, int *buf)
8762 int i;
8764 /* Order the data */
8765 for (i = 0; i < n; i++)
8767 buf[i] = a[sort[i].ind];
8770 /* Copy back to the original array */
8771 for (i = 0; i < n; i++)
8773 a[i] = buf[i];
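/*! \brief Reorders, in place, an rvec array with one entry per charge group according to the sort order */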
8777 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8778 rvec *v, rvec *buf)
8780 int i;
8782 /* Order the data */
8783 for (i = 0; i < n; i++)
8785 copy_rvec(v[sort[i].ind], buf[i]);
8788 /* Copy back to the original array */
8789 for (i = 0; i < n; i++)
8791 copy_rvec(buf[i], v[i]);
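/*! \brief Reorders, in place, an rvec array with one entry per atom according to the charge group sort order */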
8795 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8796 rvec *v, rvec *buf)
8798 int a, atot, cg, cg0, cg1, i;
8800 if (cgindex == nullptr)
9002 /* Avoid the useless loop over the atoms within a cg */
8803 order_vec_cg(ncg, sort, v, buf);
8805 return;
8808 /* Order the data */
8809 a = 0;
8810 for (cg = 0; cg < ncg; cg++)
8812 cg0 = cgindex[sort[cg].ind];
8813 cg1 = cgindex[sort[cg].ind+1];
8814 for (i = cg0; i < cg1; i++)
8816 copy_rvec(v[i], buf[a]);
8817 a++;
8820 atot = a;
8822 /* Copy back to the original array */
8823 for (a = 0; a < atot; a++)
8825 copy_rvec(buf[a], v[a]);
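/*! \brief qsorts the (usually short) list of moved charge groups and merges it with the already ordered list of stationary ones into sort1 */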
8829 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8830 int nsort_new, gmx_cgsort_t *sort_new,
8831 gmx_cgsort_t *sort1)
8833 int i1, i2, i_new;
8835 /* The new indices are not very ordered, so we qsort them */
8836 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8838 /* sort2 is already ordered, so now we can merge the two arrays */
8839 i1 = 0;
8840 i2 = 0;
8841 i_new = 0;
8842 while (i2 < nsort2 || i_new < nsort_new)
8844 if (i2 == nsort2)
8846 sort1[i1++] = sort_new[i_new++];
8848 else if (i_new == nsort_new)
8850 sort1[i1++] = sort2[i2++];
8852 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8853 (sort2[i2].nsc == sort_new[i_new].nsc &&
8854 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8856 sort1[i1++] = sort2[i2++];
8858 else
8860 sort1[i1++] = sort_new[i_new++];
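/*! \brief Determines the charge group sort order for the group cut-off scheme from the ns grid cell indices, returns the new number of home charge groups */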
8865 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8867 gmx_domdec_sort_t *sort;
8868 gmx_cgsort_t *cgsort, *sort_i;
8869 int ncg_new, nsort2, nsort_new, i, *a, moved;
8871 sort = dd->comm->sort;
8873 a = fr->ns->grid->cell_index;
8875 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8877 if (ncg_home_old >= 0)
8879 /* The charge groups that remained in the same ns grid cell
8880 * are completely ordered. So we can sort efficiently by sorting
8881 * the charge groups that did move into the stationary list.
8883 ncg_new = 0;
8884 nsort2 = 0;
8885 nsort_new = 0;
8886 for (i = 0; i < dd->ncg_home; i++)
8888 /* Check if this cg did not move to another node */
8889 if (a[i] < moved)
8891 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8893 /* This cg is new on this node or moved ns grid cell */
8894 if (nsort_new >= sort->sort_new_nalloc)
8896 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8897 srenew(sort->sort_new, sort->sort_new_nalloc);
8899 sort_i = &(sort->sort_new[nsort_new++]);
8901 else
8903 /* This cg did not move */
8904 sort_i = &(sort->sort2[nsort2++]);
8906 /* Sort on the ns grid cell indices
8907 * and the global topology index.
8908 * index_gl is irrelevant with cell ns,
8909 * but we set it here anyhow to avoid a conditional.
8911 sort_i->nsc = a[i];
8912 sort_i->ind_gl = dd->index_gl[i];
8913 sort_i->ind = i;
8914 ncg_new++;
8917 if (debug)
8919 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8920 nsort2, nsort_new);
8922 /* Sort efficiently */
8923 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8924 sort->sort);
8926 else
8928 cgsort = sort->sort;
8929 ncg_new = 0;
8930 for (i = 0; i < dd->ncg_home; i++)
8932 /* Sort on the ns grid cell indices
8933 * and the global topology index
8935 cgsort[i].nsc = a[i];
8936 cgsort[i].ind_gl = dd->index_gl[i];
8937 cgsort[i].ind = i;
8938 if (cgsort[i].nsc < moved)
8940 ncg_new++;
8943 if (debug)
8945 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8947 /* Determine the order of the charge groups using qsort */
8948 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8951 return ncg_new;
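/*! \brief Determines the atom sort order for the Verlet cut-off scheme from the nbnxn grid, returns the new number of home atoms */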
8954 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8956 gmx_cgsort_t *sort;
8957 int ncg_new, i, na;
8958 const int *a;
8960 sort = dd->comm->sort->sort;
8962 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8964 ncg_new = 0;
8965 for (i = 0; i < na; i++)
8967 if (a[i] >= 0)
8969 sort[ncg_new].ind = a[i];
8970 ncg_new++;
8974 return ncg_new;
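/*! \brief Sorts the home charge groups, the state vectors and the associated index arrays according to the grid order */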
8977 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8978 int ncg_home_old)
8980 gmx_domdec_sort_t *sort;
8981 gmx_cgsort_t *cgsort;
8982 int *cgindex;
8983 int ncg_new, i, *ibuf, cgsize;
8984 rvec *vbuf;
8986 sort = dd->comm->sort;
8988 if (dd->ncg_home > sort->sort_nalloc)
8990 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8991 srenew(sort->sort, sort->sort_nalloc);
8992 srenew(sort->sort2, sort->sort_nalloc);
8994 cgsort = sort->sort;
8996 switch (fr->cutoff_scheme)
8998 case ecutsGROUP:
8999 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9000 break;
9001 case ecutsVERLET:
9002 ncg_new = dd_sort_order_nbnxn(dd, fr);
9003 break;
9004 default:
9005 gmx_incons("unimplemented");
9006 ncg_new = 0;
9009 /* We alloc with the old size, since cgindex is still old */
9010 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9011 vbuf = dd->comm->vbuf.v;
9013 if (dd->comm->bCGs)
9015 cgindex = dd->cgindex;
9017 else
9019 cgindex = nullptr;
9022 /* Remove the charge groups which are no longer at home here */
9023 dd->ncg_home = ncg_new;
9024 if (debug)
9026 fprintf(debug, "Set the new home charge group count to %d\n",
9027 dd->ncg_home);
9030 /* Reorder the state */
9031 if (state->flags & (1 << estX))
9033 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
9035 if (state->flags & (1 << estV))
9037 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
9039 if (state->flags & (1 << estCGP))
9041 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9044 if (fr->cutoff_scheme == ecutsGROUP)
9046 /* Reorder cgcm */
9047 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9050 if (dd->ncg_home+1 > sort->ibuf_nalloc)
9052 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9053 srenew(sort->ibuf, sort->ibuf_nalloc);
9055 ibuf = sort->ibuf;
9056 /* Reorder the global cg index */
9057 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9058 /* Reorder the cginfo */
9059 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9060 /* Rebuild the local cg index */
9061 if (dd->comm->bCGs)
9063 ibuf[0] = 0;
9064 for (i = 0; i < dd->ncg_home; i++)
9066 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9067 ibuf[i+1] = ibuf[i] + cgsize;
9069 for (i = 0; i < dd->ncg_home+1; i++)
9071 dd->cgindex[i] = ibuf[i];
9074 else
9076 for (i = 0; i < dd->ncg_home+1; i++)
9078 dd->cgindex[i] = i;
9081 /* Set the home atom number */
9082 dd->nat_home = dd->cgindex[dd->ncg_home];
9084 if (fr->cutoff_scheme == ecutsVERLET)
9086 /* The atoms are now exactly in grid order, update the grid order */
9087 nbnxn_set_atomorder(fr->nbv->nbs);
9089 else
9091 /* Copy the sorted ns cell indices back to the ns grid struct */
9092 for (i = 0; i < dd->ncg_home; i++)
9094 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9096 fr->ns->grid->nr = dd->ncg_home;
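/*! \brief Adds the atom communication counts of the current decomposition to the running DD statistics */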
9100 static void add_dd_statistics(gmx_domdec_t *dd)
9102 gmx_domdec_comm_t *comm;
9103 int ddnat;
9105 comm = dd->comm;
9107 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9109 comm->sum_nat[ddnat-ddnatZONE] +=
9110 comm->nat[ddnat] - comm->nat[ddnat-1];
9112 comm->ndecomp++;
9115 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9117 gmx_domdec_comm_t *comm;
9118 int ddnat;
9120 comm = dd->comm;
9122 /* Reset all the statistics and counters for total run counting */
9123 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9125 comm->sum_nat[ddnat-ddnatZONE] = 0;
9127 comm->ndecomp = 0;
9128 comm->nload = 0;
9129 comm->load_step = 0;
9130 comm->load_sum = 0;
9131 comm->load_max = 0;
9132 clear_ivec(comm->load_lim);
9133 comm->load_mdf = 0;
9134 comm->load_pme = 0;
9137 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
9139 gmx_domdec_comm_t *comm;
9140 int ddnat;
9141 double av;
9143 comm = cr->dd->comm;
9145 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9147 if (fplog == nullptr)
9149 return;
9152 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9154 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9156 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9157 switch (ddnat)
9159 case ddnatZONE:
9160 fprintf(fplog,
9161 " av. #atoms communicated per step for force: %d x %.1f\n",
9162 2, av);
9163 break;
9164 case ddnatVSITE:
9165 if (cr->dd->vsite_comm)
9167 fprintf(fplog,
9168 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9169 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9170 av);
9172 break;
9173 case ddnatCON:
9174 if (cr->dd->constraint_comm)
9176 fprintf(fplog,
9177 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9178 1 + ir->nLincsIter, av);
9180 break;
9181 default:
9182 gmx_incons(" Unknown type for DD statistics");
9185 fprintf(fplog, "\n");
9187 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9189 print_dd_load_av(fplog, cr->dd);
9193 void dd_partition_system(FILE *fplog,
9194 gmx_int64_t step,
9195 const t_commrec *cr,
9196 gmx_bool bMasterState,
9197 int nstglobalcomm,
9198 t_state *state_global,
9199 const gmx_mtop_t *top_global,
9200 const t_inputrec *ir,
9201 t_state *state_local,
9202 PaddedRVecVector *f,
9203 gmx::MDAtoms *mdAtoms,
9204 gmx_localtop_t *top_local,
9205 t_forcerec *fr,
9206 gmx_vsite_t *vsite,
9207 gmx_constr_t constr,
9208 t_nrnb *nrnb,
9209 gmx_wallcycle *wcycle,
9210 gmx_bool bVerbose)
9212 gmx_domdec_t *dd;
9213 gmx_domdec_comm_t *comm;
9214 gmx_ddbox_t ddbox = {0};
9215 t_block *cgs_gl;
9216 gmx_int64_t step_pcoupl;
9217 rvec cell_ns_x0, cell_ns_x1;
9218 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9219 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9220 gmx_bool bRedist, bSortCG, bResortAll;
9221 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9222 real grid_density;
9223 char sbuf[22];
9225 wallcycle_start(wcycle, ewcDOMDEC);
9227 dd = cr->dd;
9228 comm = dd->comm;
9230 bBoxChanged = (bMasterState || inputrecDeform(ir));
9231 if (ir->epc != epcNO)
9233 /* With nstpcouple > 1, pressure coupling happens
9234 * one step after calculating the pressure.
9235 * Box scaling happens at the end of the MD step,
9236 * after the DD partitioning.
9237 * We therefore have to do DLB in the first partitioning
9238 * after an MD step where P-coupling occurred.
9239 * We need to determine the last step in which p-coupling occurred.
9240 * MRS -- need to validate this for vv?
9242 n = ir->nstpcouple;
9243 if (n == 1)
9245 step_pcoupl = step - 1;
9247 else
9249 step_pcoupl = ((step - 1)/n)*n + 1;
9251 if (step_pcoupl >= comm->partition_step)
9253 bBoxChanged = TRUE;
9257 bNStGlobalComm = (step % nstglobalcomm == 0);
9259 if (!isDlbOn(comm))
9261 bDoDLB = FALSE;
9263 else
9265 /* Should we do dynamic load balancing this step?
9266 * Since it requires (possibly expensive) global communication,
9267 * we might want to do DLB less frequently.
9269 if (bBoxChanged || ir->epc != epcNO)
9271 bDoDLB = bBoxChanged;
9273 else
9275 bDoDLB = bNStGlobalComm;
9279 /* Check if we have recorded loads on the nodes */
9280 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9282 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9284 /* Print load every nstlog, first and last step to the log file */
9285 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9286 comm->n_load_collect == 0 ||
9287 (ir->nsteps >= 0 &&
9288 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9290 /* Avoid extra communication due to verbose screen output
9291 * when nstglobalcomm is set.
9293 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9294 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9296 get_load_distribution(dd, wcycle);
9297 if (DDMASTER(dd))
9299 if (bLogLoad)
9301 dd_print_load(fplog, dd, step-1);
9303 if (bVerbose)
9305 dd_print_load_verbose(dd);
9308 comm->n_load_collect++;
9310 if (isDlbOn(comm))
9312 if (DDMASTER(dd))
9314 /* Add the measured cycles to the running average */
9315 const float averageFactor = 0.1f;
9316 comm->cyclesPerStepDlbExpAverage =
9317 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9318 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
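/* This is an exponential moving average: the most recent per-step cycle count enters with weight averageFactor. */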
9320 if (comm->dlbState == edlbsOnCanTurnOff &&
9321 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9323 gmx_bool turnOffDlb;
9324 if (DDMASTER(dd))
9326 /* If the running averaged cycles with DLB are more
9327 * than before we turned on DLB, turn off DLB.
9328 * We will again run and check the cycles without DLB
9329 * and we can then decide whether to turn off DLB forever.
9331 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9332 comm->cyclesPerStepBeforeDLB);
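/* Only the master rank has the timing data, so broadcast its decision to all ranks. */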
9334 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9335 if (turnOffDlb)
9337 /* To turn off DLB, we need to redistribute the atoms */
9338 dd_collect_state(dd, state_local, state_global);
9339 bMasterState = TRUE;
9340 turn_off_dlb(fplog, cr, step);
9344 else if (bCheckWhetherToTurnDlbOn)
9346 gmx_bool turnOffDlbForever = FALSE;
9347 gmx_bool turnOnDlb = FALSE;
9349 /* Since the timings are node dependent, the master decides */
9350 if (DDMASTER(dd))
9352 /* If we recently turned off DLB, we want to check if
9353 * performance is better without DLB. We want to do this
9354 * ASAP to minimize the chance that external factors that
9355 * slowed down the DLB step have disappeared by the time we check,
9356 * which would make us incorrectly conclude that DLB was causing the slowdown.
9357 * So we measure one nstlist block, no running average.
9359 if (comm->haveTurnedOffDlb &&
9360 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9361 comm->cyclesPerStepDlbExpAverage)
9363 /* After turning off DLB we ran nstlist steps in fewer
9364 * cycles than with DLB. This likely means that DLB
9365 * is not beneficial, but this could be due to a one-time
9366 * unlucky fluctuation, so we require two such
9367 * observations in close succession to turn off DLB
9368 * forever.
9370 if (comm->dlbSlowerPartitioningCount > 0 &&
9371 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9373 turnOffDlbForever = TRUE;
9375 comm->haveTurnedOffDlb = false;
9376 /* Register when we last measured DLB slowdown */
9377 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9379 else
9381 * Here we check if the max PME rank load is more than 0.98 times
9382 * the max PP force load. If so, PP DLB will not help,
9383 * since we are (almost) limited by PME. Furthermore,
9384 * DLB will cause a significant extra x/f redistribution
9385 * cost on the PME ranks, which will then surely result
9386 * in lower total performance.
9388 if (cr->npmenodes > 0 &&
9389 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9391 turnOnDlb = FALSE;
9393 else
9395 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9399 struct
9401 gmx_bool turnOffDlbForever;
9402 gmx_bool turnOnDlb;
9404 bools {
9405 turnOffDlbForever, turnOnDlb
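/* Both decisions are packed into one struct so a single broadcast keeps all ranks consistent. */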
9407 dd_bcast(dd, sizeof(bools), &bools);
9408 if (bools.turnOffDlbForever)
9410 turn_off_dlb_forever(fplog, cr, step);
9412 else if (bools.turnOnDlb)
9414 turn_on_dlb(fplog, cr, step);
9415 bDoDLB = TRUE;
9419 comm->n_load_have++;
9422 cgs_gl = &comm->cgs_gl;
9424 bRedist = FALSE;
9425 if (bMasterState)
9427 /* Clear the old state */
9428 clear_dd_indices(dd, 0, 0);
9429 ncgindex_set = 0;
9431 rvec *xGlobal = (SIMMASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr);
9433 set_ddbox(dd, bMasterState, cr, ir,
9434 SIMMASTER(cr) ? state_global->box : nullptr,
9435 TRUE, cgs_gl, xGlobal,
9436 &ddbox);
9438 get_cg_distribution(fplog, dd, cgs_gl,
9439 SIMMASTER(cr) ? state_global->box : nullptr,
9440 &ddbox, xGlobal);
9442 dd_distribute_state(dd, cgs_gl,
9443 state_global, state_local, f);
9445 dd_make_local_cgs(dd, &top_local->cgs);
9447 /* Ensure that we have space for the new distribution */
9448 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9450 if (fr->cutoff_scheme == ecutsGROUP)
9452 calc_cgcm(fplog, 0, dd->ncg_home,
9453 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9456 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9458 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9460 else if (state_local->ddp_count != dd->ddp_count)
9462 if (state_local->ddp_count > dd->ddp_count)
9464 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9467 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9469 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9472 /* Clear the old state */
9473 clear_dd_indices(dd, 0, 0);
9475 /* Build the new indices */
9476 rebuild_cgindex(dd, cgs_gl->index, state_local);
9477 make_dd_indices(dd, cgs_gl->index, 0);
9478 ncgindex_set = dd->ncg_home;
9480 if (fr->cutoff_scheme == ecutsGROUP)
9482 /* Redetermine the cg COMs */
9483 calc_cgcm(fplog, 0, dd->ncg_home,
9484 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9487 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9489 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9491 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9492 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9494 bRedist = isDlbOn(comm);
9496 else
9498 /* We have the full state, only redistribute the cgs */
9500 /* Clear the non-home indices */
9501 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9502 ncgindex_set = 0;
9504 /* Avoid global communication for dim's without pbc and -gcom */
9505 if (!bNStGlobalComm)
9507 copy_rvec(comm->box0, ddbox.box0 );
9508 copy_rvec(comm->box_size, ddbox.box_size);
9510 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9511 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9513 bBoxChanged = TRUE;
9514 bRedist = TRUE;
9516 /* For dim's without pbc and -gcom */
9517 copy_rvec(ddbox.box0, comm->box0 );
9518 copy_rvec(ddbox.box_size, comm->box_size);
9520 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9521 step, wcycle);
9523 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9525 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9528 /* Check if we should sort the charge groups */
9529 bSortCG = (bMasterState || bRedist);
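/* Sorting is only needed when the set of home charge groups has changed, i.e. after distributing the master state or after redistribution. */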
9531 ncg_home_old = dd->ncg_home;
9533 /* When repartitioning we mark charge groups that will move to neighboring
9534 * DD cells, but we do not move them right away for performance reasons.
9535 * Thus we need to keep track of how many charge groups will move
9536 * to obtain correct local charge group / atom counts.
9538 ncg_moved = 0;
9539 if (bRedist)
9541 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9543 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9544 state_local, f, fr,
9545 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9547 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9550 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9551 dd, &ddbox,
9552 &comm->cell_x0, &comm->cell_x1,
9553 dd->ncg_home, fr->cg_cm,
9554 cell_ns_x0, cell_ns_x1, &grid_density);
9556 if (bBoxChanged)
9558 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9561 switch (fr->cutoff_scheme)
9563 case ecutsGROUP:
9564 copy_ivec(fr->ns->grid->n, ncells_old);
9565 grid_first(fplog, fr->ns->grid, dd, &ddbox,
9566 state_local->box, cell_ns_x0, cell_ns_x1,
9567 fr->rlist, grid_density);
9568 break;
9569 case ecutsVERLET:
9570 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9571 break;
9572 default:
9573 gmx_incons("unimplemented");
9575 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9576 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9578 if (bSortCG)
9580 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9582 /* Sort the state on charge group position.
9583 * This enables exact restarts from this step.
9584 * It also improves performance by about 15% with larger numbers
9585 * of atoms per node.
9588 /* Fill the ns grid with the home cell,
9589 * so we can sort with the indices.
9591 set_zones_ncg_home(dd);
9593 switch (fr->cutoff_scheme)
9595 case ecutsVERLET:
9596 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
9598 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9600 comm->zones.size[0].bb_x0,
9601 comm->zones.size[0].bb_x1,
9602 0, dd->ncg_home,
9603 comm->zones.dens_zone0,
9604 fr->cginfo,
9605 as_rvec_array(state_local->x.data()),
9606 ncg_moved, bRedist ? comm->moved : nullptr,
9607 fr->nbv->grp[eintLocal].kernel_type,
9608 fr->nbv->nbat);
9610 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9611 break;
9612 case ecutsGROUP:
9613 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9614 0, dd->ncg_home, fr->cg_cm);
9616 copy_ivec(fr->ns->grid->n, ncells_new);
9617 break;
9618 default:
9619 gmx_incons("unimplemented");
9622 bResortAll = bMasterState;
9624 /* Check if we can use the old order and ns grid cell indices
9625 * of the charge groups to sort the charge groups efficiently.
9627 if (ncells_new[XX] != ncells_old[XX] ||
9628 ncells_new[YY] != ncells_old[YY] ||
9629 ncells_new[ZZ] != ncells_old[ZZ])
9631 bResortAll = TRUE;
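/* The ns grid dimensions changed, so the old cell indices are invalid and all charge groups have to be resorted. */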
9634 if (debug)
9636 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9637 gmx_step_str(step, sbuf), dd->ncg_home);
9639 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9640 bResortAll ? -1 : ncg_home_old);
9642 /* After sorting and compacting we set the correct size */
9643 dd_resize_state(state_local, f, dd->nat_home);
9645 /* Rebuild all the indices */
9646 ga2la_clear(dd->ga2la);
9647 ncgindex_set = 0;
9649 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9652 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9654 /* Set up the communication and communicate the coordinates */
9655 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9657 /* Set the indices */
9658 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9660 /* Set the charge group boundaries for neighbor searching */
9661 set_cg_boundaries(&comm->zones);
9663 if (fr->cutoff_scheme == ecutsVERLET)
9665 set_zones_size(dd, state_local->box, &ddbox,
9666 bSortCG ? 1 : 0, comm->zones.n,
9670 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9673 write_dd_pdb("dd_home", step, "dump", top_global, cr,
9674 -1, as_rvec_array(state_local->x.data()), state_local->box);
9677 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9679 /* Extract a local topology from the global topology */
9680 for (i = 0; i < dd->ndim; i++)
9682 np[dd->dim[i]] = comm->cd[i].np;
9684 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9685 comm->cellsize_min, np,
9687 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9688 vsite, top_global, top_local);
9690 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9692 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9694 /* Set up the special atom communication */
9695 n = comm->nat[ddnatZONE];
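/* n starts at the number of zone atoms and grows as vsite and constraint atoms are added; comm->nat[] stores the running totals. */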
9696 for (i = ddnatZONE+1; i < ddnatNR; i++)
9698 switch (i)
9700 case ddnatVSITE:
9701 if (vsite && vsite->n_intercg_vsite)
9703 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9705 break;
9706 case ddnatCON:
9707 if (dd->bInterCGcons || dd->bInterCGsettles)
9709 /* Only for inter-cg constraints do we need special code */
9710 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9711 constr, ir->nProjOrder,
9712 top_local->idef.il);
9714 break;
9715 default:
9716 gmx_incons("Unknown special atom type setup");
9718 comm->nat[i] = n;
9721 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9723 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9725 /* Make space for the extra coordinates for virtual site
9726 * or constraint communication.
9728 state_local->natoms = comm->nat[ddnatNR-1];
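/* The last entry of comm->nat holds the local atom count including all communicated vsite and constraint atoms. */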
9730 dd_resize_state(state_local, f, state_local->natoms);
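/* Determine how many atoms need entries in the separate force buffer that is excluded from the virial summation. */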
9732 if (fr->haveDirectVirialContributions)
9734 if (vsite && vsite->n_intercg_vsite)
9736 nat_f_novirsum = comm->nat[ddnatVSITE];
9738 else
9740 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9742 nat_f_novirsum = dd->nat_tot;
9744 else
9746 nat_f_novirsum = dd->nat_home;
9750 else
9752 nat_f_novirsum = 0;
9755 /* Set the number of atoms required for the force calculation.
9756 * Forces need to be constrained when doing energy
9757 * minimization. For simple simulations we could avoid some
9758 * allocation, zeroing and copying, but this is probably not worth
9759 * the complications and checking.
9761 forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9762 dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9764 /* Update atom data for mdatoms and several algorithms */
9765 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9766 nullptr, mdAtoms, vsite, nullptr);
9768 auto mdatoms = mdAtoms->mdatoms();
9769 if (!thisRankHasDuty(cr, DUTY_PME))
9771 /* Send the charges and/or c6/sigmas to our PME-only node */
9772 gmx_pme_send_parameters(cr,
9773 fr->ic,
9774 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9775 mdatoms->chargeA, mdatoms->chargeB,
9776 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9777 mdatoms->sigmaA, mdatoms->sigmaB,
9778 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9781 if (constr)
9783 set_constraints(constr, top_local, ir, mdatoms, cr);
9786 if (ir->bPull)
9788 /* Update the local pull groups */
9789 dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9792 if (ir->bRot)
9794 /* Update the local rotation groups */
9795 dd_make_local_rotation_groups(dd, ir->rot);
9798 if (ir->eSwapCoords != eswapNO)
9800 /* Update the local groups needed for ion swapping */
9801 dd_make_local_swap_groups(dd, ir->swap);
9804 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9805 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9807 add_dd_statistics(dd);
9809 /* Make sure we only count the cycles for this DD partitioning */
9810 clear_dd_cycle_counts(dd);
9812 /* Because the order of the atoms might have changed since
9813 * the last vsite construction, we need to communicate the constructing
9814 * atom coordinates again (for spreading the forces this MD step).
9816 dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9818 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9820 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9822 dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()), nullWallcycle);
9823 write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9824 -1, as_rvec_array(state_local->x.data()), state_local->box);
9827 /* Store the partitioning step */
9828 comm->partition_step = step;
9830 /* Increase the DD partitioning counter */
9831 dd->ddp_count++;
9832 /* The state currently matches this DD partitioning count, store it */
9833 state_local->ddp_count = dd->ddp_count;
9834 if (bMasterState)
9836 /* The DD master node knows the complete cg distribution,
9837 * store the count so we can possibly skip the cg info communication.
9839 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9842 if (comm->DD_debug > 0)
9844 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9845 check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9846 "after partitioning");
9849 wallcycle_stop(wcycle, ewcDOMDEC);