Improved state_change_natoms and used it more
[gromacs.git] / src / gromacs / domdec / domdec.cpp
blob 42163ee1172dff3d5dd3cb2b4fb777b525022604
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
49 #include <algorithm>
51 #include "gromacs/domdec/domdec_network.h"
52 #include "gromacs/domdec/ga2la.h"
53 #include "gromacs/ewald/pme.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/network.h"
58 #include "gromacs/gmxlib/nrnb.h"
59 #include "gromacs/gpu_utils/gpu_utils.h"
60 #include "gromacs/hardware/hw_info.h"
61 #include "gromacs/imd/imd.h"
62 #include "gromacs/listed-forces/manage-threading.h"
63 #include "gromacs/math/functions.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/math/vectypes.h"
66 #include "gromacs/mdlib/constr.h"
67 #include "gromacs/mdlib/force.h"
68 #include "gromacs/mdlib/forcerec.h"
69 #include "gromacs/mdlib/genborn.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/mdrun.h"
73 #include "gromacs/mdlib/mdsetup.h"
74 #include "gromacs/mdlib/nb_verlet.h"
75 #include "gromacs/mdlib/nbnxn_grid.h"
76 #include "gromacs/mdlib/nsgrid.h"
77 #include "gromacs/mdlib/vsite.h"
78 #include "gromacs/mdtypes/commrec.h"
79 #include "gromacs/mdtypes/df_history.h"
80 #include "gromacs/mdtypes/forcerec.h"
81 #include "gromacs/mdtypes/inputrec.h"
82 #include "gromacs/mdtypes/md_enums.h"
83 #include "gromacs/mdtypes/mdatom.h"
84 #include "gromacs/mdtypes/nblist.h"
85 #include "gromacs/mdtypes/state.h"
86 #include "gromacs/pbcutil/ishift.h"
87 #include "gromacs/pbcutil/pbc.h"
88 #include "gromacs/pulling/pull.h"
89 #include "gromacs/pulling/pull_rotation.h"
90 #include "gromacs/swap/swapcoords.h"
91 #include "gromacs/timing/wallcycle.h"
92 #include "gromacs/topology/block.h"
93 #include "gromacs/topology/idef.h"
94 #include "gromacs/topology/ifunc.h"
95 #include "gromacs/topology/mtop_lookup.h"
96 #include "gromacs/topology/mtop_util.h"
97 #include "gromacs/topology/topology.h"
98 #include "gromacs/utility/basedefinitions.h"
99 #include "gromacs/utility/basenetwork.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/exceptions.h"
102 #include "gromacs/utility/fatalerror.h"
103 #include "gromacs/utility/gmxmpi.h"
104 #include "gromacs/utility/qsort_threadsafe.h"
105 #include "gromacs/utility/real.h"
106 #include "gromacs/utility/smalloc.h"
108 #include "domdec_constraints.h"
109 #include "domdec_internal.h"
110 #include "domdec_vsite.h"
112 #define DDRANK(dd, rank) (rank)
113 #define DDMASTERRANK(dd) (dd->masterrank)
115 struct gmx_domdec_master_t
117 /* The cell boundaries */
118 real **cell_x;
119 /* The global charge group division */
120 int *ncg; /* Number of home charge groups for each node */
121 int *index; /* Index of nnodes+1 into cg */
122 int *cg; /* Global charge group index */
123 int *nat; /* Number of home atoms for each node. */
124 int *ibuf; /* Buffer for communication */
125 rvec *vbuf; /* Buffer for state scattering and gathering */
128 #define DD_NLOAD_MAX 9
130 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
132 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
133 #define DD_CGIBS 2
135 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
136 #define DD_FLAG_NRCG 65535
137 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
138 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
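/* Layout sketch (an assumption based on the masks above, not a statement of
 * the redistribution code): for a charge group of 3 atoms moved forward along
 * decomposition dimension 1, the flag word would look like
 *   (3 & DD_FLAG_NRCG) | DD_FLAG_FW(1)
 * i.e. the atom count in the low 16 bits and the move bit at position 18.
 */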
140 /* The DD zone order */
141 static const ivec dd_zo[DD_MAXZONE] =
142 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
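/* Reading dd_zo (illustrative): zone 0 is the home cell and the other entries
 * are its forward-shifted neighbours, e.g. zone 1 is shifted by one cell in x
 * and zone 7 by one cell in each of x, y and z.
 */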
144 /* The non-bonded zone-pair setup for domain decomposition
145 * The first number is the i-zone, the second number the first j-zone seen by
146 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
147 * As is, this is for 3D decomposition, where there are 4 i-zones.
148 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
149 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
151 static const int
152 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
153 {1, 3, 6},
154 {2, 5, 6},
155 {3, 5, 7}};
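/* Reading the table above (illustrative): the row {1, 3, 6} means that i-zone 1
 * sees j-zones 3, 4 and 5 (6 is last+1), while {0, 0, 8} means that the home
 * i-zone 0 sees all eight zones.
 */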
157 /* Factors used to avoid problems due to rounding issues */
158 #define DD_CELL_MARGIN 1.0001
159 #define DD_CELL_MARGIN2 1.00005
160 /* Factor to account for pressure scaling during nstlist steps */
161 #define DD_PRES_SCALE_MARGIN 1.02
163 /* Turn on DLB when the load imbalance causes this amount of total loss.
164 * There is a bit of overhead with DLB and it's difficult to achieve
165 * a load imbalance of less than 2% with DLB.
167 #define DD_PERF_LOSS_DLB_ON 0.02
169 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
170 #define DD_PERF_LOSS_WARN 0.05
172 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
174 /* Use separate MPI send and receive commands
175 * when nnodes <= GMX_DD_NNODES_SENDRECV.
176 * This saves memory (and some copying for small nnodes).
177 * For high parallelization scatter and gather calls are used.
179 #define GMX_DD_NNODES_SENDRECV 4
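/* Example of how this threshold is used (see dd_collect_vec further down):
 * with 4 ranks the master collects a vector through per-rank send/receive
 * calls, while with e.g. 32 ranks a single gatherv call is used instead.
 */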
182 /* We check whether to turn on DLB at the first and then at every 100th DD partitioning.
183 * With large imbalance DLB will turn on at the first step, so we can
184 * make the interval so large that the MPI overhead of the check is negligible.
186 static const int c_checkTurnDlbOnInterval = 100;
187 /* We need to check if DLB results in worse performance and then turn it off.
188 * We check this more often than for turning DLB on, because the DLB can scale
189 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
190 * and furthermore, we are already synchronizing often with DLB, so
191 * the overhead of the MPI Bcast is not that high.
193 static const int c_checkTurnDlbOffInterval = 20;
195 /* Forward declaration */
196 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
210 /* This order is required to minimize the coordinate communication in PME
211 * which uses decomposition in the x direction.
213 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
215 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
217 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
218 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
219 xyz[ZZ] = ind % nc[ZZ];
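/* Usage sketch (illustrative, hypothetical values):
 *   ivec nc = {4, 3, 2};
 *   ivec ci = {1, 2, 0};
 *   int ind = dd_index(nc, ci);   // ((1*3 + 2)*2) + 0 = 10
 *   ivec xyz;
 *   ddindex2xyz(nc, ind, xyz);    // recovers xyz = {1, 2, 0}
 */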
222 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
224 int ddindex;
225 int ddnodeid = -1;
227 ddindex = dd_index(dd->nc, c);
228 if (dd->comm->bCartesianPP_PME)
230 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
232 else if (dd->comm->bCartesianPP)
234 #if GMX_MPI
235 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
236 #endif
238 else
240 ddnodeid = ddindex;
243 return ddnodeid;
246 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
248 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
251 int ddglatnr(const gmx_domdec_t *dd, int i)
253 int atnr;
255 if (dd == nullptr)
257 atnr = i + 1;
259 else
261 if (i >= dd->comm->nat[ddnatNR-1])
263 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
265 atnr = dd->gatindex[i] + 1;
268 return atnr;
271 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
273 return &dd->comm->cgs_gl;
276 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
278 return (comm->dlbState == edlbsOnCanTurnOff ||
279 comm->dlbState == edlbsOnForever);
282 static void vec_rvec_init(vec_rvec_t *v)
284 v->nalloc = 0;
285 v->v = nullptr;
288 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
290 if (n > v->nalloc)
292 v->nalloc = over_alloc_dd(n);
293 srenew(v->v, v->nalloc);
297 void dd_store_state(gmx_domdec_t *dd, t_state *state)
299 int i;
301 if (state->ddp_count != dd->ddp_count)
303 gmx_incons("The state does not match the domain decomposition state");
306 state->cg_gl.resize(dd->ncg_home);
307 for (i = 0; i < dd->ncg_home; i++)
309 state->cg_gl[i] = dd->index_gl[i];
312 state->ddp_count_cg_gl = dd->ddp_count;
315 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
317 return &dd->comm->zones;
320 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
321 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
323 gmx_domdec_zones_t *zones;
324 int izone, d, dim;
326 zones = &dd->comm->zones;
328 izone = 0;
329 while (icg >= zones->izone[izone].cg1)
331 izone++;
334 if (izone == 0)
336 *jcg0 = icg;
338 else if (izone < zones->nizone)
340 *jcg0 = zones->izone[izone].jcg0;
342 else
344 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
345 icg, izone, zones->nizone);
348 *jcg1 = zones->izone[izone].jcg1;
350 for (d = 0; d < dd->ndim; d++)
352 dim = dd->dim[d];
353 shift0[dim] = zones->izone[izone].shift0[dim];
354 shift1[dim] = zones->izone[izone].shift1[dim];
355 if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
357 /* A conservative approach, this can be optimized */
358 shift0[dim] -= 1;
359 shift1[dim] += 1;
364 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
366 /* We currently set mdatoms entries for all atoms:
367 * local + non-local + communicated for vsite + constraints
370 return dd->comm->nat[ddnatNR - 1];
373 int dd_natoms_vsite(const gmx_domdec_t *dd)
375 return dd->comm->nat[ddnatVSITE];
378 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
380 *at_start = dd->comm->nat[ddnatCON-1];
381 *at_end = dd->comm->nat[ddnatCON];
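/* Note (inferred from the accessors above): comm->nat[] holds cumulative atom
 * counts, so consecutive entries bound each class of atoms: home atoms first,
 * zone-communicated atoms up to nat[ddnatZONE], vsite-communicated atoms up to
 * nat[ddnatVSITE] and constraint-communicated atoms up to nat[ddnatCON], which
 * equals the nat[ddnatNR-1] total used for mdatoms.
 */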
384 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
386 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
387 int *index, *cgindex;
388 gmx_domdec_comm_t *comm;
389 gmx_domdec_comm_dim_t *cd;
390 gmx_domdec_ind_t *ind;
391 rvec shift = {0, 0, 0}, *buf, *rbuf;
392 gmx_bool bPBC, bScrew;
394 comm = dd->comm;
396 cgindex = dd->cgindex;
398 buf = comm->vbuf.v;
400 nzone = 1;
401 nat_tot = dd->nat_home;
402 for (d = 0; d < dd->ndim; d++)
404 bPBC = (dd->ci[dd->dim[d]] == 0);
405 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
406 if (bPBC)
408 copy_rvec(box[dd->dim[d]], shift);
410 cd = &comm->cd[d];
411 for (p = 0; p < cd->np; p++)
413 ind = &cd->ind[p];
414 index = ind->index;
415 n = 0;
416 if (!bPBC)
418 for (i = 0; i < ind->nsend[nzone]; i++)
420 at0 = cgindex[index[i]];
421 at1 = cgindex[index[i]+1];
422 for (j = at0; j < at1; j++)
424 copy_rvec(x[j], buf[n]);
425 n++;
429 else if (!bScrew)
431 for (i = 0; i < ind->nsend[nzone]; i++)
433 at0 = cgindex[index[i]];
434 at1 = cgindex[index[i]+1];
435 for (j = at0; j < at1; j++)
437 /* We need to shift the coordinates */
438 rvec_add(x[j], shift, buf[n]);
439 n++;
443 else
445 for (i = 0; i < ind->nsend[nzone]; i++)
447 at0 = cgindex[index[i]];
448 at1 = cgindex[index[i]+1];
449 for (j = at0; j < at1; j++)
451 /* Shift x */
452 buf[n][XX] = x[j][XX] + shift[XX];
453 /* Rotate y and z.
454 * This operation requires a special shift force
455 * treatment, which is performed in calc_vir.
457 buf[n][YY] = box[YY][YY] - x[j][YY];
458 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
459 n++;
464 if (cd->bInPlace)
466 rbuf = x + nat_tot;
468 else
470 rbuf = comm->vbuf2.v;
472 /* Send and receive the coordinates */
473 dd_sendrecv_rvec(dd, d, dddirBackward,
474 buf, ind->nsend[nzone+1],
475 rbuf, ind->nrecv[nzone+1]);
476 if (!cd->bInPlace)
478 j = 0;
479 for (zone = 0; zone < nzone; zone++)
481 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
483 copy_rvec(rbuf[j], x[i]);
484 j++;
488 nat_tot += ind->nrecv[nzone+1];
490 nzone += nzone;
494 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
496 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
497 int *index, *cgindex;
498 gmx_domdec_comm_t *comm;
499 gmx_domdec_comm_dim_t *cd;
500 gmx_domdec_ind_t *ind;
501 rvec *buf, *sbuf;
502 ivec vis;
503 int is;
504 gmx_bool bShiftForcesNeedPbc, bScrew;
506 comm = dd->comm;
508 cgindex = dd->cgindex;
510 buf = comm->vbuf.v;
512 nzone = comm->zones.n/2;
513 nat_tot = dd->nat_tot;
514 for (d = dd->ndim-1; d >= 0; d--)
516 /* Only forces in domains near the PBC boundaries need to
517 consider PBC in the treatment of fshift */
518 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
519 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
520 if (fshift == nullptr && !bScrew)
522 bShiftForcesNeedPbc = FALSE;
524 /* Determine which shift vector we need */
525 clear_ivec(vis);
526 vis[dd->dim[d]] = 1;
527 is = IVEC2IS(vis);
529 cd = &comm->cd[d];
530 for (p = cd->np-1; p >= 0; p--)
532 ind = &cd->ind[p];
533 nat_tot -= ind->nrecv[nzone+1];
534 if (cd->bInPlace)
536 sbuf = f + nat_tot;
538 else
540 sbuf = comm->vbuf2.v;
541 j = 0;
542 for (zone = 0; zone < nzone; zone++)
544 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
546 copy_rvec(f[i], sbuf[j]);
547 j++;
551 /* Communicate the forces */
552 dd_sendrecv_rvec(dd, d, dddirForward,
553 sbuf, ind->nrecv[nzone+1],
554 buf, ind->nsend[nzone+1]);
555 index = ind->index;
556 /* Add the received forces */
557 n = 0;
558 if (!bShiftForcesNeedPbc)
560 for (i = 0; i < ind->nsend[nzone]; i++)
562 at0 = cgindex[index[i]];
563 at1 = cgindex[index[i]+1];
564 for (j = at0; j < at1; j++)
566 rvec_inc(f[j], buf[n]);
567 n++;
571 else if (!bScrew)
573 /* fshift should always be defined if this function is
574 * called when bShiftForcesNeedPbc is true */
575 assert(NULL != fshift);
576 for (i = 0; i < ind->nsend[nzone]; i++)
578 at0 = cgindex[index[i]];
579 at1 = cgindex[index[i]+1];
580 for (j = at0; j < at1; j++)
582 rvec_inc(f[j], buf[n]);
583 /* Add this force to the shift force */
584 rvec_inc(fshift[is], buf[n]);
585 n++;
589 else
591 for (i = 0; i < ind->nsend[nzone]; i++)
593 at0 = cgindex[index[i]];
594 at1 = cgindex[index[i]+1];
595 for (j = at0; j < at1; j++)
597 /* Rotate the force */
598 f[j][XX] += buf[n][XX];
599 f[j][YY] -= buf[n][YY];
600 f[j][ZZ] -= buf[n][ZZ];
601 if (fshift)
603 /* Add this force to the shift force */
604 rvec_inc(fshift[is], buf[n]);
606 n++;
611 nzone /= 2;
615 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
617 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
618 int *index, *cgindex;
619 gmx_domdec_comm_t *comm;
620 gmx_domdec_comm_dim_t *cd;
621 gmx_domdec_ind_t *ind;
622 real *buf, *rbuf;
624 comm = dd->comm;
626 cgindex = dd->cgindex;
628 buf = &comm->vbuf.v[0][0];
630 nzone = 1;
631 nat_tot = dd->nat_home;
632 for (d = 0; d < dd->ndim; d++)
634 cd = &comm->cd[d];
635 for (p = 0; p < cd->np; p++)
637 ind = &cd->ind[p];
638 index = ind->index;
639 n = 0;
640 for (i = 0; i < ind->nsend[nzone]; i++)
642 at0 = cgindex[index[i]];
643 at1 = cgindex[index[i]+1];
644 for (j = at0; j < at1; j++)
646 buf[n] = v[j];
647 n++;
651 if (cd->bInPlace)
653 rbuf = v + nat_tot;
655 else
657 rbuf = &comm->vbuf2.v[0][0];
659 /* Send and receive the coordinates */
660 dd_sendrecv_real(dd, d, dddirBackward,
661 buf, ind->nsend[nzone+1],
662 rbuf, ind->nrecv[nzone+1]);
663 if (!cd->bInPlace)
665 j = 0;
666 for (zone = 0; zone < nzone; zone++)
668 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
670 v[i] = rbuf[j];
671 j++;
675 nat_tot += ind->nrecv[nzone+1];
677 nzone += nzone;
681 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
683 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
684 int *index, *cgindex;
685 gmx_domdec_comm_t *comm;
686 gmx_domdec_comm_dim_t *cd;
687 gmx_domdec_ind_t *ind;
688 real *buf, *sbuf;
690 comm = dd->comm;
692 cgindex = dd->cgindex;
694 buf = &comm->vbuf.v[0][0];
696 nzone = comm->zones.n/2;
697 nat_tot = dd->nat_tot;
698 for (d = dd->ndim-1; d >= 0; d--)
700 cd = &comm->cd[d];
701 for (p = cd->np-1; p >= 0; p--)
703 ind = &cd->ind[p];
704 nat_tot -= ind->nrecv[nzone+1];
705 if (cd->bInPlace)
707 sbuf = v + nat_tot;
709 else
711 sbuf = &comm->vbuf2.v[0][0];
712 j = 0;
713 for (zone = 0; zone < nzone; zone++)
715 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
717 sbuf[j] = v[i];
718 j++;
722 /* Communicate the forces */
723 dd_sendrecv_real(dd, d, dddirForward,
724 sbuf, ind->nrecv[nzone+1],
725 buf, ind->nsend[nzone+1]);
726 index = ind->index;
727 /* Add the received forces */
728 n = 0;
729 for (i = 0; i < ind->nsend[nzone]; i++)
731 at0 = cgindex[index[i]];
732 at1 = cgindex[index[i]+1];
733 for (j = at0; j < at1; j++)
735 v[j] += buf[n];
736 n++;
740 nzone /= 2;
744 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
746 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
747 d, i, j,
748 zone->min0, zone->max1,
749 zone->mch0, zone->mch1,
750 zone->p1_0, zone->p1_1);
754 #define DDZONECOMM_MAXZONE 5
755 #define DDZONECOMM_BUFSIZE 3
757 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
758 int ddimind, int direction,
759 gmx_ddzone_t *buf_s, int n_s,
760 gmx_ddzone_t *buf_r, int n_r)
762 #define ZBS DDZONECOMM_BUFSIZE
763 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
764 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
765 int i;
767 for (i = 0; i < n_s; i++)
769 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
770 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
771 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
772 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
773 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
774 vbuf_s[i*ZBS+1][2] = 0;
775 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
776 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
777 vbuf_s[i*ZBS+2][2] = 0;
780 dd_sendrecv_rvec(dd, ddimind, direction,
781 vbuf_s, n_s*ZBS,
782 vbuf_r, n_r*ZBS);
784 for (i = 0; i < n_r; i++)
786 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
787 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
788 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
789 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
790 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
791 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
792 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
795 #undef ZBS
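/* Packing sketch: each gmx_ddzone_t travels as DDZONECOMM_BUFSIZE = 3 rvecs,
 * i.e. 9 reals, of which 7 carry data (min0, max1, min1, mch0, mch1, p1_0,
 * p1_1) and 2 are padding, giving at most DDZONECOMM_MAXZONE*3 rvecs per
 * message.
 */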
798 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
799 rvec cell_ns_x0, rvec cell_ns_x1)
801 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
802 gmx_ddzone_t *zp;
803 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
804 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
805 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
806 rvec extr_s[2], extr_r[2];
807 rvec dh;
808 real dist_d, c = 0, det;
809 gmx_domdec_comm_t *comm;
810 gmx_bool bPBC, bUse;
812 comm = dd->comm;
814 for (d = 1; d < dd->ndim; d++)
816 dim = dd->dim[d];
817 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
818 zp->min0 = cell_ns_x0[dim];
819 zp->max1 = cell_ns_x1[dim];
820 zp->min1 = cell_ns_x1[dim];
821 zp->mch0 = cell_ns_x0[dim];
822 zp->mch1 = cell_ns_x1[dim];
823 zp->p1_0 = cell_ns_x0[dim];
824 zp->p1_1 = cell_ns_x1[dim];
827 for (d = dd->ndim-2; d >= 0; d--)
829 dim = dd->dim[d];
830 bPBC = (dim < ddbox->npbcdim);
832 /* Use an rvec to store two reals */
833 extr_s[d][0] = comm->cell_f0[d+1];
834 extr_s[d][1] = comm->cell_f1[d+1];
835 extr_s[d][2] = comm->cell_f1[d+1];
837 pos = 0;
838 /* Store the extremes in the backward sending buffer,
839 * so they get updated separately from the forward communication.
841 for (d1 = d; d1 < dd->ndim-1; d1++)
843 /* We invert the order to be able to use the same loop for buf_e */
844 buf_s[pos].min0 = extr_s[d1][1];
845 buf_s[pos].max1 = extr_s[d1][0];
846 buf_s[pos].min1 = extr_s[d1][2];
847 buf_s[pos].mch0 = 0;
848 buf_s[pos].mch1 = 0;
849 /* Store the cell corner of the dimension we communicate along */
850 buf_s[pos].p1_0 = comm->cell_x0[dim];
851 buf_s[pos].p1_1 = 0;
852 pos++;
855 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
856 pos++;
858 if (dd->ndim == 3 && d == 0)
860 buf_s[pos] = comm->zone_d2[0][1];
861 pos++;
862 buf_s[pos] = comm->zone_d1[0];
863 pos++;
866 /* We only need to communicate the extremes
867 * in the forward direction
869 npulse = comm->cd[d].np;
870 if (bPBC)
872 /* Take the minimum to avoid double communication */
873 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
875 else
877 /* Without PBC we should really not communicate over
878 * the boundaries, but implementing that complicates
879 * the communication setup and therefore we simply
880 * do all communication, but ignore some data.
882 npulse_min = npulse;
884 for (p = 0; p < npulse_min; p++)
886 /* Communicate the extremes forward */
887 bUse = (bPBC || dd->ci[dim] > 0);
889 dd_sendrecv_rvec(dd, d, dddirForward,
890 extr_s+d, dd->ndim-d-1,
891 extr_r+d, dd->ndim-d-1);
893 if (bUse)
895 for (d1 = d; d1 < dd->ndim-1; d1++)
897 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
898 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
899 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
904 buf_size = pos;
905 for (p = 0; p < npulse; p++)
907 /* Communicate all the zone information backward */
908 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
910 dd_sendrecv_ddzone(dd, d, dddirBackward,
911 buf_s, buf_size,
912 buf_r, buf_size);
914 clear_rvec(dh);
915 if (p > 0)
917 for (d1 = d+1; d1 < dd->ndim; d1++)
919 /* Determine the decrease of maximum required
920 * communication height along d1 due to the distance along d,
921 * this avoids a lot of useless atom communication.
923 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
925 if (ddbox->tric_dir[dim])
927 /* c is the off-diagonal coupling between the cell planes
928 * along directions d and d1.
930 c = ddbox->v[dim][dd->dim[d1]][dim];
932 else
934 c = 0;
936 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
937 if (det > 0)
939 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
941 else
943 /* A negative value signals out of range */
944 dh[d1] = -1;
949 /* Accumulate the extremes over all pulses */
950 for (i = 0; i < buf_size; i++)
952 if (p == 0)
954 buf_e[i] = buf_r[i];
956 else
958 if (bUse)
960 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
961 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
962 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
965 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
967 d1 = 1;
969 else
971 d1 = d + 1;
973 if (bUse && dh[d1] >= 0)
975 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
976 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
979 /* Copy the received buffer to the send buffer,
980 * to pass the data through with the next pulse.
982 buf_s[i] = buf_r[i];
984 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
985 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
987 /* Store the extremes */
988 pos = 0;
990 for (d1 = d; d1 < dd->ndim-1; d1++)
992 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
993 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
994 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
995 pos++;
998 if (d == 1 || (d == 0 && dd->ndim == 3))
1000 for (i = d; i < 2; i++)
1002 comm->zone_d2[1-d][i] = buf_e[pos];
1003 pos++;
1006 if (d == 0)
1008 comm->zone_d1[1] = buf_e[pos];
1009 pos++;
1015 if (dd->ndim >= 2)
1017 dim = dd->dim[1];
1018 for (i = 0; i < 2; i++)
1020 if (debug)
1022 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1024 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1025 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1028 if (dd->ndim >= 3)
1030 dim = dd->dim[2];
1031 for (i = 0; i < 2; i++)
1033 for (j = 0; j < 2; j++)
1035 if (debug)
1037 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1039 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1040 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1044 for (d = 1; d < dd->ndim; d++)
1046 comm->cell_f_max0[d] = extr_s[d-1][0];
1047 comm->cell_f_min1[d] = extr_s[d-1][1];
1048 if (debug)
1050 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1051 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1056 static void dd_collect_cg(gmx_domdec_t *dd,
1057 t_state *state_local)
1059 gmx_domdec_master_t *ma = nullptr;
1060 int buf2[2], *ibuf, i, ncg_home = 0, *cg = nullptr, nat_home = 0;
1062 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1064 /* The master has the correct distribution */
1065 return;
1068 if (state_local->ddp_count == dd->ddp_count)
1070 /* The local state and DD are in sync, use the DD indices */
1071 ncg_home = dd->ncg_home;
1072 cg = dd->index_gl;
1073 nat_home = dd->nat_home;
1075 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1077 /* The DD is out of sync with the local state, but we have stored
1078 * the cg indices with the local state, so we can use those.
1080 t_block *cgs_gl;
1082 cgs_gl = &dd->comm->cgs_gl;
1084 ncg_home = state_local->cg_gl.size();
1085 cg = state_local->cg_gl.data();
1086 nat_home = 0;
1087 for (i = 0; i < ncg_home; i++)
1089 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1092 else
1094 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1097 buf2[0] = ncg_home;
1098 buf2[1] = nat_home;
1099 if (DDMASTER(dd))
1101 ma = dd->ma;
1102 ibuf = ma->ibuf;
1104 else
1106 ibuf = nullptr;
1108 /* Collect the charge group and atom counts on the master */
1109 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1111 if (DDMASTER(dd))
1113 ma->index[0] = 0;
1114 for (i = 0; i < dd->nnodes; i++)
1116 ma->ncg[i] = ma->ibuf[2*i];
1117 ma->nat[i] = ma->ibuf[2*i+1];
1118 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1121 /* Make byte counts and indices */
1122 for (i = 0; i < dd->nnodes; i++)
1124 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1125 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1127 if (debug)
1129 fprintf(debug, "Initial charge group distribution: ");
1130 for (i = 0; i < dd->nnodes; i++)
1132 fprintf(debug, " %d", ma->ncg[i]);
1134 fprintf(debug, "\n");
1138 /* Collect the charge group indices on the master */
1139 dd_gatherv(dd,
1140 ncg_home*sizeof(int), cg,
1141 DDMASTER(dd) ? ma->ibuf : nullptr,
1142 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1143 DDMASTER(dd) ? ma->cg : nullptr);
1145 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1148 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1149 const rvec *lv, rvec *v)
1151 gmx_domdec_master_t *ma;
1152 int n, i, c, a, nalloc = 0;
1153 rvec *buf = nullptr;
1154 t_block *cgs_gl;
1156 ma = dd->ma;
1158 if (!DDMASTER(dd))
1160 #if GMX_MPI
1161 MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
1162 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1163 #endif
1165 else
1167 /* Copy the master coordinates to the global array */
1168 cgs_gl = &dd->comm->cgs_gl;
1170 n = DDMASTERRANK(dd);
1171 a = 0;
1172 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1174 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1176 copy_rvec(lv[a++], v[c]);
1180 for (n = 0; n < dd->nnodes; n++)
1182 if (n != dd->rank)
1184 if (ma->nat[n] > nalloc)
1186 nalloc = over_alloc_dd(ma->nat[n]);
1187 srenew(buf, nalloc);
1189 #if GMX_MPI
1190 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1191 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1192 #endif
1193 a = 0;
1194 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1196 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1198 copy_rvec(buf[a++], v[c]);
1203 sfree(buf);
1207 static void get_commbuffer_counts(gmx_domdec_t *dd,
1208 int **counts, int **disps)
1210 gmx_domdec_master_t *ma;
1211 int n;
1213 ma = dd->ma;
1215 /* Make the rvec count and displacement arrays */
1216 *counts = ma->ibuf;
1217 *disps = ma->ibuf + dd->nnodes;
1218 for (n = 0; n < dd->nnodes; n++)
1220 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1221 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1225 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1226 const rvec *lv, rvec *v)
1228 gmx_domdec_master_t *ma;
1229 int *rcounts = nullptr, *disps = nullptr;
1230 int n, i, c, a;
1231 rvec *buf = nullptr;
1232 t_block *cgs_gl;
1234 ma = dd->ma;
1236 if (DDMASTER(dd))
1238 get_commbuffer_counts(dd, &rcounts, &disps);
1240 buf = ma->vbuf;
1243 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1245 if (DDMASTER(dd))
1247 cgs_gl = &dd->comm->cgs_gl;
1249 a = 0;
1250 for (n = 0; n < dd->nnodes; n++)
1252 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1254 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1256 copy_rvec(buf[a++], v[c]);
1263 void dd_collect_vec(gmx_domdec_t *dd,
1264 t_state *state_local,
1265 const PaddedRVecVector *localVector,
1266 rvec *v)
1268 dd_collect_cg(dd, state_local);
1270 const rvec *lv = as_rvec_array(localVector->data());
1272 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1274 dd_collect_vec_sendrecv(dd, lv, v);
1276 else
1278 dd_collect_vec_gatherv(dd, lv, v);
1282 void dd_collect_vec(gmx_domdec_t *dd,
1283 t_state *state_local,
1284 const PaddedRVecVector *localVector,
1285 PaddedRVecVector *vector)
1287 dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
1291 void dd_collect_state(gmx_domdec_t *dd,
1292 t_state *state_local, t_state *state)
1294 int nh = state->nhchainlength;
1296 if (DDMASTER(dd))
1298 for (int i = 0; i < efptNR; i++)
1300 state->lambda[i] = state_local->lambda[i];
1302 state->fep_state = state_local->fep_state;
1303 state->veta = state_local->veta;
1304 state->vol0 = state_local->vol0;
1305 copy_mat(state_local->box, state->box);
1306 copy_mat(state_local->boxv, state->boxv);
1307 copy_mat(state_local->svir_prev, state->svir_prev);
1308 copy_mat(state_local->fvir_prev, state->fvir_prev);
1309 copy_mat(state_local->pres_prev, state->pres_prev);
1311 for (int i = 0; i < state_local->ngtc; i++)
1313 for (int j = 0; j < nh; j++)
1315 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1316 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1318 state->therm_integral[i] = state_local->therm_integral[i];
1320 for (int i = 0; i < state_local->nnhpres; i++)
1322 for (int j = 0; j < nh; j++)
1324 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1325 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1329 if (state_local->flags & (1 << estX))
1331 dd_collect_vec(dd, state_local, &state_local->x, &state->x);
1333 if (state_local->flags & (1 << estV))
1335 dd_collect_vec(dd, state_local, &state_local->v, &state->v);
1337 if (state_local->flags & (1 << estCGP))
1339 dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
1343 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1345 if (debug)
1347 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1350 state_change_natoms(state, natoms);
1352 if (f != nullptr)
1354 /* We need to allocate one element extra, since we might use
1355 * (unaligned) 4-wide SIMD loads to access rvec entries.
1357 f->resize(natoms + 1);
1361 static void dd_check_alloc_ncg(t_forcerec *fr,
1362 t_state *state,
1363 PaddedRVecVector *f,
1364 int numChargeGroups)
1366 if (numChargeGroups > fr->cg_nalloc)
1368 if (debug)
1370 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1372 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1373 srenew(fr->cginfo, fr->cg_nalloc);
1374 if (fr->cutoff_scheme == ecutsGROUP)
1376 srenew(fr->cg_cm, fr->cg_nalloc);
1379 if (fr->cutoff_scheme == ecutsVERLET)
1381 /* We don't use charge groups, we use x in state to set up
1382 * the atom communication.
1384 dd_resize_state(state, f, numChargeGroups);
1388 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1389 rvec *v, rvec *lv)
1391 gmx_domdec_master_t *ma;
1392 int n, i, c, a, nalloc = 0;
1393 rvec *buf = nullptr;
1395 if (DDMASTER(dd))
1397 ma = dd->ma;
1399 for (n = 0; n < dd->nnodes; n++)
1401 if (n != dd->rank)
1403 if (ma->nat[n] > nalloc)
1405 nalloc = over_alloc_dd(ma->nat[n]);
1406 srenew(buf, nalloc);
1408 /* Use lv as a temporary buffer */
1409 a = 0;
1410 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1412 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1414 copy_rvec(v[c], buf[a++]);
1417 if (a != ma->nat[n])
1419 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1420 a, ma->nat[n]);
1423 #if GMX_MPI
1424 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1425 DDRANK(dd, n), n, dd->mpi_comm_all);
1426 #endif
1429 sfree(buf);
1430 n = DDMASTERRANK(dd);
1431 a = 0;
1432 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1434 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1436 copy_rvec(v[c], lv[a++]);
1440 else
1442 #if GMX_MPI
1443 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1444 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1445 #endif
1449 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1450 rvec *v, rvec *lv)
1452 gmx_domdec_master_t *ma;
1453 int *scounts = nullptr, *disps = nullptr;
1454 int n, i, c, a;
1455 rvec *buf = nullptr;
1457 if (DDMASTER(dd))
1459 ma = dd->ma;
1461 get_commbuffer_counts(dd, &scounts, &disps);
1463 buf = ma->vbuf;
1464 a = 0;
1465 for (n = 0; n < dd->nnodes; n++)
1467 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1469 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1471 copy_rvec(v[c], buf[a++]);
1477 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1480 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1482 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1484 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1486 else
1488 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1492 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1494 if (dfhist == nullptr)
1496 return;
1499 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1500 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1501 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1503 if (dfhist->nlambda > 0)
1505 int nlam = dfhist->nlambda;
1506 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1507 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1508 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1509 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1510 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1511 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1513 for (int i = 0; i < nlam; i++)
1515 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1516 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1517 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1518 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1519 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1520 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1525 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1526 t_state *state, t_state *state_local,
1527 PaddedRVecVector *f)
1529 int nh = state->nhchainlength;
1531 if (DDMASTER(dd))
1533 for (int i = 0; i < efptNR; i++)
1535 state_local->lambda[i] = state->lambda[i];
1537 state_local->fep_state = state->fep_state;
1538 state_local->veta = state->veta;
1539 state_local->vol0 = state->vol0;
1540 copy_mat(state->box, state_local->box);
1541 copy_mat(state->box_rel, state_local->box_rel);
1542 copy_mat(state->boxv, state_local->boxv);
1543 copy_mat(state->svir_prev, state_local->svir_prev);
1544 copy_mat(state->fvir_prev, state_local->fvir_prev);
1545 if (state->dfhist != nullptr)
1547 copy_df_history(state_local->dfhist, state->dfhist);
1549 for (int i = 0; i < state_local->ngtc; i++)
1551 for (int j = 0; j < nh; j++)
1553 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1554 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1556 state_local->therm_integral[i] = state->therm_integral[i];
1558 for (int i = 0; i < state_local->nnhpres; i++)
1560 for (int j = 0; j < nh; j++)
1562 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1563 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1567 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1568 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1569 dd_bcast(dd, sizeof(real), &state_local->veta);
1570 dd_bcast(dd, sizeof(real), &state_local->vol0);
1571 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1572 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1573 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1574 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1575 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1576 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1577 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1578 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1579 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1580 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1582 /* communicate df_history -- required for restarting from checkpoint */
1583 dd_distribute_dfhist(dd, state_local->dfhist);
1585 dd_resize_state(state_local, f, dd->nat_home);
1587 if (state_local->flags & (1 << estX))
1589 dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
1591 if (state_local->flags & (1 << estV))
1593 dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
1595 if (state_local->flags & (1 << estCGP))
1597 dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
1601 static char dim2char(int dim)
1603 char c = '?';
1605 switch (dim)
1607 case XX: c = 'X'; break;
1608 case YY: c = 'Y'; break;
1609 case ZZ: c = 'Z'; break;
1610 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1613 return c;
1616 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1617 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1619 rvec grid_s[2], *grid_r = nullptr, cx, r;
1620 char fname[STRLEN], buf[22];
1621 FILE *out;
1622 int a, i, d, z, y, x;
1623 matrix tric;
1624 real vol;
1626 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1627 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1629 if (DDMASTER(dd))
1631 snew(grid_r, 2*dd->nnodes);
1634 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
1636 if (DDMASTER(dd))
1638 for (d = 0; d < DIM; d++)
1640 for (i = 0; i < DIM; i++)
1642 if (d == i)
1644 tric[d][i] = 1;
1646 else
1648 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1650 tric[d][i] = box[i][d]/box[i][i];
1652 else
1654 tric[d][i] = 0;
1659 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1660 out = gmx_fio_fopen(fname, "w");
1661 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1662 a = 1;
1663 for (i = 0; i < dd->nnodes; i++)
1665 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1666 for (d = 0; d < DIM; d++)
1668 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1670 for (z = 0; z < 2; z++)
1672 for (y = 0; y < 2; y++)
1674 for (x = 0; x < 2; x++)
1676 cx[XX] = grid_r[i*2+x][XX];
1677 cx[YY] = grid_r[i*2+y][YY];
1678 cx[ZZ] = grid_r[i*2+z][ZZ];
1679 mvmul(tric, cx, r);
1680 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1681 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1685 for (d = 0; d < DIM; d++)
1687 for (x = 0; x < 4; x++)
1689 switch (d)
1691 case 0: y = 1 + i*8 + 2*x; break;
1692 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1693 case 2: y = 1 + i*8 + x; break;
1695 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1699 gmx_fio_fclose(out);
1700 sfree(grid_r);
1704 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1705 const gmx_mtop_t *mtop, t_commrec *cr,
1706 int natoms, rvec x[], matrix box)
1708 char fname[STRLEN], buf[22];
1709 FILE *out;
1710 int i, ii, resnr, c;
1711 const char *atomname, *resname;
1712 real b;
1713 gmx_domdec_t *dd;
1715 dd = cr->dd;
1716 if (natoms == -1)
1718 natoms = dd->comm->nat[ddnatVSITE];
1721 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1723 out = gmx_fio_fopen(fname, "w");
1725 fprintf(out, "TITLE %s\n", title);
1726 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1727 int molb = 0;
1728 for (i = 0; i < natoms; i++)
1730 ii = dd->gatindex[i];
1731 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1732 if (i < dd->comm->nat[ddnatZONE])
1734 c = 0;
1735 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1737 c++;
1739 b = c;
1741 else if (i < dd->comm->nat[ddnatVSITE])
1743 b = dd->comm->zones.n;
1745 else
1747 b = dd->comm->zones.n + 1;
1749 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1750 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1752 fprintf(out, "TER\n");
1754 gmx_fio_fclose(out);
1757 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1759 gmx_domdec_comm_t *comm;
1760 int di;
1761 real r;
1763 comm = dd->comm;
1765 r = -1;
1766 if (comm->bInterCGBondeds)
1768 if (comm->cutoff_mbody > 0)
1770 r = comm->cutoff_mbody;
1772 else
1774 /* cutoff_mbody=0 means we do not have DLB */
1775 r = comm->cellsize_min[dd->dim[0]];
1776 for (di = 1; di < dd->ndim; di++)
1778 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1780 if (comm->bBondComm)
1782 r = std::max(r, comm->cutoff_mbody);
1784 else
1786 r = std::min(r, comm->cutoff);
1791 return r;
1794 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1796 real r_mb;
1798 r_mb = dd_cutoff_multibody(dd);
1800 return std::max(dd->comm->cutoff, r_mb);
1804 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1805 ivec coord_pme)
1807 int nc, ntot;
1809 nc = dd->nc[dd->comm->cartpmedim];
1810 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1811 copy_ivec(coord, coord_pme);
1812 coord_pme[dd->comm->cartpmedim] =
1813 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
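/* Worked example (hypothetical sizes): with nc = 4 PP cells and ntot = 6
 * Cartesian slots along cartpmedim there are ntot - nc = 2 PME slots;
 * PP coordinates 0 and 1 map to PME coordinate 4 (since 4 + (0*2+1)/4 and
 * 4 + (1*2+1)/4 both truncate to 4), while 2 and 3 map to 5.
 */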
1816 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1818 int npp, npme;
1820 npp = dd->nnodes;
1821 npme = dd->comm->npmenodes;
1823 /* Here we assign a PME node to communicate with this DD node
1824 * by assuming that the major index of both is x.
1825 * We add cr->npmenodes/2 to obtain an even distribution.
1827 return (ddindex*npme + npme/2)/npp;
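/* Worked example (hypothetical counts): with npp = 8 PP ranks and npme = 4
 * PME ranks, (ddindex*4 + 2)/8 maps DD indices {0,1} -> 0, {2,3} -> 1,
 * {4,5} -> 2 and {6,7} -> 3, so pairs of consecutive PP ranks share one PME rank.
 */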
1830 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1832 int *pme_rank;
1833 int n, i, p0, p1;
1835 snew(pme_rank, dd->comm->npmenodes);
1836 n = 0;
1837 for (i = 0; i < dd->nnodes; i++)
1839 p0 = ddindex2pmeindex(dd, i);
1840 p1 = ddindex2pmeindex(dd, i+1);
1841 if (i+1 == dd->nnodes || p1 > p0)
1843 if (debug)
1845 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1847 pme_rank[n] = i + 1 + n;
1848 n++;
1852 return pme_rank;
1855 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1857 gmx_domdec_t *dd;
1858 ivec coords;
1859 int slab;
1861 dd = cr->dd;
1863 /* if (dd->comm->bCartesian) {
1864 gmx_ddindex2xyz(dd->nc,ddindex,coords);
1865 dd_coords2pmecoords(dd,coords,coords_pme);
1866 copy_ivec(dd->ntot,nc);
1867 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1868 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1870 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1871 } else {
1872 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
1873 } */
1875 coords[XX] = x;
1876 coords[YY] = y;
1877 coords[ZZ] = z;
1878 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1880 return slab;
1883 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1885 gmx_domdec_comm_t *comm;
1886 ivec coords;
1887 int ddindex, nodeid = -1;
1889 comm = cr->dd->comm;
1891 coords[XX] = x;
1892 coords[YY] = y;
1893 coords[ZZ] = z;
1894 if (comm->bCartesianPP_PME)
1896 #if GMX_MPI
1897 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1898 #endif
1900 else
1902 ddindex = dd_index(cr->dd->nc, coords);
1903 if (comm->bCartesianPP)
1905 nodeid = comm->ddindex2simnodeid[ddindex];
1907 else
1909 if (comm->pmenodes)
1911 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1913 else
1915 nodeid = ddindex;
1920 return nodeid;
1923 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1924 const t_commrec gmx_unused *cr,
1925 int sim_nodeid)
1927 int pmenode = -1;
1929 const gmx_domdec_comm_t *comm = dd->comm;
1931 /* This assumes a uniform x domain decomposition grid cell size */
1932 if (comm->bCartesianPP_PME)
1934 #if GMX_MPI
1935 ivec coord, coord_pme;
1936 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1937 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1939 /* This is a PP node */
1940 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1941 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1943 #endif
1945 else if (comm->bCartesianPP)
1947 if (sim_nodeid < dd->nnodes)
1949 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1952 else
1954 /* This assumes DD cells with identical x coordinates
1955 * are numbered sequentially.
1957 if (dd->comm->pmenodes == nullptr)
1959 if (sim_nodeid < dd->nnodes)
1961 /* The DD index equals the nodeid */
1962 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1965 else
1967 int i = 0;
1968 while (sim_nodeid > dd->comm->pmenodes[i])
1970 i++;
1972 if (sim_nodeid < dd->comm->pmenodes[i])
1974 pmenode = dd->comm->pmenodes[i];
1979 return pmenode;
1982 void get_pme_nnodes(const gmx_domdec_t *dd,
1983 int *npmenodes_x, int *npmenodes_y)
1985 if (dd != nullptr)
1987 *npmenodes_x = dd->comm->npmenodes_x;
1988 *npmenodes_y = dd->comm->npmenodes_y;
1990 else
1992 *npmenodes_x = 1;
1993 *npmenodes_y = 1;
1997 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
1998 int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2000 gmx_domdec_t *dd;
2001 int x, y, z;
2002 ivec coord, coord_pme;
2004 dd = cr->dd;
2006 snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2008 *nmy_ddnodes = 0;
2009 for (x = 0; x < dd->nc[XX]; x++)
2011 for (y = 0; y < dd->nc[YY]; y++)
2013 for (z = 0; z < dd->nc[ZZ]; z++)
2015 if (dd->comm->bCartesianPP_PME)
2017 coord[XX] = x;
2018 coord[YY] = y;
2019 coord[ZZ] = z;
2020 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2021 if (dd->ci[XX] == coord_pme[XX] &&
2022 dd->ci[YY] == coord_pme[YY] &&
2023 dd->ci[ZZ] == coord_pme[ZZ])
2025 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2028 else
2030 /* The slab corresponds to the nodeid in the PME group */
2031 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2033 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2040 /* The last PP-only node is the peer node */
2041 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2043 if (debug)
2045 fprintf(debug, "Receive coordinates from PP ranks:");
2046 for (x = 0; x < *nmy_ddnodes; x++)
2048 fprintf(debug, " %d", (*my_ddnodes)[x]);
2050 fprintf(debug, "\n");
2054 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2056 gmx_bool bReceive = TRUE;
2058 if (cr->npmenodes < dd->nnodes)
2060 gmx_domdec_comm_t *comm = dd->comm;
2061 if (comm->bCartesianPP_PME)
2063 #if GMX_MPI
2064 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2065 ivec coords;
2066 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2067 coords[comm->cartpmedim]++;
2068 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2070 int rank;
2071 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2072 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2074 /* This is not the last PP node for pmenode */
2075 bReceive = FALSE;
2078 #else
2079 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2080 #endif
2082 else
2084 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2085 if (cr->sim_nodeid+1 < cr->nnodes &&
2086 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2088 /* This is not the last PP node for pmenode */
2089 bReceive = FALSE;
2094 return bReceive;
2097 static void set_zones_ncg_home(gmx_domdec_t *dd)
2099 gmx_domdec_zones_t *zones;
2100 int i;
2102 zones = &dd->comm->zones;
2104 zones->cg_range[0] = 0;
2105 for (i = 1; i < zones->n+1; i++)
2107 zones->cg_range[i] = dd->ncg_home;
2109 /* zone_ncg1[0] should always be equal to ncg_home */
2110 dd->comm->zone_ncg1[0] = dd->ncg_home;
2113 static void rebuild_cgindex(gmx_domdec_t *dd,
2114 const int *gcgs_index, const t_state *state)
2116 int * gmx_restrict dd_cg_gl = dd->index_gl;
2117 int * gmx_restrict cgindex = dd->cgindex;
2118 int nat = 0;
2120 /* Copy back the global charge group indices from state
2121 * and rebuild the local charge group to atom index.
2123 cgindex[0] = nat;
2124 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2126 cgindex[i] = nat;
2127 int cg_gl = state->cg_gl[i];
2128 dd_cg_gl[i] = cg_gl;
2129 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2131 cgindex[state->cg_gl.size()] = nat;
2133 dd->ncg_home = state->cg_gl.size();
2134 dd->nat_home = nat;
2136 set_zones_ncg_home(dd);
2139 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2141 while (cg >= cginfo_mb->cg_end)
2143 cginfo_mb++;
2146 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
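/* Lookup sketch (inferred from the modulo above): cginfo is stored only for
 * cg_mod charge groups per molecule block, presumably one repeating unit, so a
 * global charge group index is first located in its block and then wrapped
 * with % cg_mod to reuse the stored entries.
 */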
2149 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2150 t_forcerec *fr, char *bLocalCG)
2152 cginfo_mb_t *cginfo_mb;
2153 int *cginfo;
2154 int cg;
2156 if (fr != nullptr)
2158 cginfo_mb = fr->cginfo_mb;
2159 cginfo = fr->cginfo;
2161 for (cg = cg0; cg < cg1; cg++)
2163 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2167 if (bLocalCG != nullptr)
2169 for (cg = cg0; cg < cg1; cg++)
2171 bLocalCG[index_gl[cg]] = TRUE;
2176 static void make_dd_indices(gmx_domdec_t *dd,
2177 const int *gcgs_index, int cg_start)
2179 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2180 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2181 gmx_bool bCGs;
2183 if (dd->nat_tot > dd->gatindex_nalloc)
2185 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2186 srenew(dd->gatindex, dd->gatindex_nalloc);
2189 nzone = dd->comm->zones.n;
2190 zone2cg = dd->comm->zones.cg_range;
2191 zone_ncg1 = dd->comm->zone_ncg1;
2192 index_gl = dd->index_gl;
2193 gatindex = dd->gatindex;
2194 bCGs = dd->comm->bCGs;
2196 if (zone2cg[1] != dd->ncg_home)
2198 gmx_incons("dd->ncg_zone is not up to date");
2201 /* Make the local to global and global to local atom index */
2202 a = dd->cgindex[cg_start];
2203 for (zone = 0; zone < nzone; zone++)
2205 if (zone == 0)
2207 cg0 = cg_start;
2209 else
2211 cg0 = zone2cg[zone];
2213 cg1 = zone2cg[zone+1];
2214 cg1_p1 = cg0 + zone_ncg1[zone];
2216 for (cg = cg0; cg < cg1; cg++)
2218 zone1 = zone;
2219 if (cg >= cg1_p1)
2221 /* Signal that this cg is from more than one pulse away */
2222 zone1 += nzone;
2224 cg_gl = index_gl[cg];
2225 if (bCGs)
2227 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2229 gatindex[a] = a_gl;
2230 ga2la_set(dd->ga2la, a_gl, a, zone1);
2231 a++;
2234 else
2236 gatindex[a] = cg_gl;
2237 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2238 a++;
2244 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2245 const char *where)
2247 int i, ngl, nerr;
2249 nerr = 0;
2250 if (bLocalCG == nullptr)
2252 return nerr;
2254 for (i = 0; i < dd->ncg_tot; i++)
2256 if (!bLocalCG[dd->index_gl[i]])
2258 fprintf(stderr,
2259 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2260 nerr++;
2263 ngl = 0;
2264 for (i = 0; i < ncg_sys; i++)
2266 if (bLocalCG[i])
2268 ngl++;
2271 if (ngl != dd->ncg_tot)
2273 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2274 nerr++;
2277 return nerr;
2280 static void check_index_consistency(gmx_domdec_t *dd,
2281 int natoms_sys, int ncg_sys,
2282 const char *where)
2284 int nerr, ngl, i, a, cell;
2285 int *have;
2287 nerr = 0;
2289 if (dd->comm->DD_debug > 1)
2291 snew(have, natoms_sys);
2292 for (a = 0; a < dd->nat_tot; a++)
2294 if (have[dd->gatindex[a]] > 0)
2296 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2298 else
2300 have[dd->gatindex[a]] = a + 1;
2303 sfree(have);
2306 snew(have, dd->nat_tot);
2308 ngl = 0;
2309 for (i = 0; i < natoms_sys; i++)
2311 if (ga2la_get(dd->ga2la, i, &a, &cell))
2313 if (a >= dd->nat_tot)
2315 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2316 nerr++;
2318 else
2320 have[a] = 1;
2321 if (dd->gatindex[a] != i)
2323 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2324 nerr++;
2327 ngl++;
2330 if (ngl != dd->nat_tot)
2332 fprintf(stderr,
2333 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2334 dd->rank, where, ngl, dd->nat_tot);
2336 for (a = 0; a < dd->nat_tot; a++)
2338 if (have[a] == 0)
2340 fprintf(stderr,
2341 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2342 dd->rank, where, a+1, dd->gatindex[a]+1);
2345 sfree(have);
2347 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2349 if (nerr > 0)
2351 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2352 dd->rank, where, nerr);
2356 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2358 int i;
2359 char *bLocalCG;
2361 if (a_start == 0)
2363 /* Clear the whole list without searching */
2364 ga2la_clear(dd->ga2la);
2366 else
2368 for (i = a_start; i < dd->nat_tot; i++)
2370 ga2la_del(dd->ga2la, dd->gatindex[i]);
2374 bLocalCG = dd->comm->bLocalCG;
2375 if (bLocalCG)
2377 for (i = cg_start; i < dd->ncg_tot; i++)
2379 bLocalCG[dd->index_gl[i]] = FALSE;
2383 dd_clear_local_vsite_indices(dd);
2385 if (dd->constraints)
2387 dd_clear_local_constraint_indices(dd);
2391 /* This function should be used for moving the domain boundaries during DLB,
2392 * for obtaining the minimum cell size. It checks the initially set limit
2393 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2394 * and, possibly, a longer cut-off limit set for PME load balancing.
2396 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2398 real cellsize_min;
2400 cellsize_min = comm->cellsize_min[dim];
2402 if (!comm->bVacDLBNoLimit)
2404 /* The cut-off might have changed, e.g. by PME load balancing,
2405 * from the value used to set comm->cellsize_min, so check it.
2407 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2409 if (comm->bPMELoadBalDLBLimits)
2411 /* Check for the cut-off limit set by the PME load balancing */
2412 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2416 return cellsize_min;
2419 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2420 int dim_ind)
2422 real grid_jump_limit;
2424 /* The distance between the boundaries of cells at distance
2425 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2426 * and by the fact that cells should not be shifted by more than
2427 * half their size, such that cg's only shift by one cell
2428 * at redecomposition.
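/* Roughly, the limit computed below is
 * max(cellsize_limit, max(cutoff, PME-tuned cutoff)/np),
 * so that each communication pulse still covers the cut-off distance.
 */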
2430 grid_jump_limit = comm->cellsize_limit;
2431 if (!comm->bVacDLBNoLimit)
2433 if (comm->bPMELoadBalDLBLimits)
2435 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2437 grid_jump_limit = std::max(grid_jump_limit,
2438 cutoff/comm->cd[dim_ind].np);
2441 return grid_jump_limit;
2444 static gmx_bool check_grid_jump(gmx_int64_t step,
2445 gmx_domdec_t *dd,
2446 real cutoff,
2447 gmx_ddbox_t *ddbox,
2448 gmx_bool bFatal)
2450 gmx_domdec_comm_t *comm;
2451 int d, dim;
2452 real limit, bfac;
2453 gmx_bool bInvalid;
2455 bInvalid = FALSE;
2457 comm = dd->comm;
2459 for (d = 1; d < dd->ndim; d++)
2461 dim = dd->dim[d];
2462 limit = grid_jump_limit(comm, cutoff, d);
2463 bfac = ddbox->box_size[dim];
2464 if (ddbox->tric_dir[dim])
2466 bfac *= ddbox->skew_fac[dim];
2468 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2469 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2471 bInvalid = TRUE;
2473 if (bFatal)
2475 char buf[22];
2477 /* This error should never be triggered under normal
2478 * circumstances, but you never know ...
2480 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2481 gmx_step_str(step, buf),
2482 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2487 return bInvalid;
2490 static int dd_load_count(gmx_domdec_comm_t *comm)
2492 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2495 static float dd_force_load(gmx_domdec_comm_t *comm)
2497 float load;
2499 if (comm->eFlop)
2501 load = comm->flop;
2502 if (comm->eFlop > 1)
2504 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
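/* Note: with eFlop > 1 a random perturbation of up to +/-5% (scaled by
 * eFlop-1) is added to the flop-based load, presumably to exercise the
 * load balancer with artificial imbalance. */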
2507 else
2509 load = comm->cycl[ddCyclF];
2510 if (comm->cycl_n[ddCyclF] > 1)
2512 /* Subtract the maximum of the last n cycle counts
2513 * to get rid of possible high counts due to other sources,
2514 * for instance system activity, that would otherwise
2515 * affect the dynamic load balancing.
2517 load -= comm->cycl_max[ddCyclF];
2520 #if GMX_MPI
2521 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2523 float gpu_wait, gpu_wait_sum;
2525 gpu_wait = comm->cycl[ddCyclWaitGPU];
2526 if (comm->cycl_n[ddCyclF] > 1)
2528 /* We should remove the WaitGPU time of the same MD step
2529 * as the one with the maximum F time, since the F time
2530 * and the wait time are not independent.
2531 * Furthermore, the step for the max F time should be chosen
2532 * the same on all ranks that share the same GPU.
2533 * But to keep the code simple, we remove the average instead.
2534 * The main reason for artificially long times at some steps
2535 * is spurious CPU activity or MPI time, so we don't expect
2536 * that changes in the GPU wait time matter a lot here.
2538 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
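/* The (n-1)/n factor above removes one step's worth, i.e. the average,
 * of the accumulated GPU wait time, as described in the comment above. */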
2540 /* Sum the wait times over the ranks that share the same GPU */
2541 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2542 comm->mpi_comm_gpu_shared);
2543 /* Replace the wait time by the average over the ranks */
2544 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2546 #endif
2549 return load;
2552 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2554 gmx_domdec_comm_t *comm;
2555 int i;
2557 comm = dd->comm;
2559 snew(*dim_f, dd->nc[dim]+1);
2560 (*dim_f)[0] = 0;
2561 for (i = 1; i < dd->nc[dim]; i++)
2563 if (comm->slb_frac[dim])
2565 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2567 else
2569 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2572 (*dim_f)[dd->nc[dim]] = 1;
2575 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2577 int pmeindex, slab, nso, i;
2578 ivec xyz;
2580 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2582 ddpme->dim = YY;
2584 else
2586 ddpme->dim = dimind;
2588 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2590 ddpme->nslab = (ddpme->dim == 0 ?
2591 dd->comm->npmenodes_x :
2592 dd->comm->npmenodes_y);
2594 if (ddpme->nslab <= 1)
2596 return;
2599 nso = dd->comm->npmenodes/ddpme->nslab;
2600 /* Determine for each PME slab the PP location range for dimension dim */
2601 snew(ddpme->pp_min, ddpme->nslab);
2602 snew(ddpme->pp_max, ddpme->nslab);
2603 for (slab = 0; slab < ddpme->nslab; slab++)
2605 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2606 ddpme->pp_max[slab] = 0;
2608 for (i = 0; i < dd->nnodes; i++)
2610 ddindex2xyz(dd->nc, i, xyz);
2611 /* For y only use our y/z slab.
2612 * This assumes that the PME x grid size matches the DD grid size.
2614 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2616 pmeindex = ddindex2pmeindex(dd, i);
2617 if (dimind == 0)
2619 slab = pmeindex/nso;
2621 else
2623 slab = pmeindex % ddpme->nslab;
2625 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2626 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2630 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2633 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2635 if (dd->comm->ddpme[0].dim == XX)
2637 return dd->comm->ddpme[0].maxshift;
2639 else
2641 return 0;
2645 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2647 if (dd->comm->ddpme[0].dim == YY)
2649 return dd->comm->ddpme[0].maxshift;
2651 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2653 return dd->comm->ddpme[1].maxshift;
2655 else
2657 return 0;
2661 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2662 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2663 const real *cell_f)
2665 gmx_domdec_comm_t *comm;
2666 int nc, ns, s;
2667 int *xmin, *xmax;
2668 real range, pme_boundary;
2669 int sh;
2671 comm = dd->comm;
2672 nc = dd->nc[ddpme->dim];
2673 ns = ddpme->nslab;
2675 if (!ddpme->dim_match)
2677 /* PP decomposition is not along dim: the worst situation */
2678 sh = ns/2;
2680 else if (ns <= 3 || (bUniform && ns == nc))
2682 /* The optimal situation */
2683 sh = 1;
2685 else
2687 /* We need to check for all pme nodes which nodes they
2688 * could possibly need to communicate with.
2690 xmin = ddpme->pp_min;
2691 xmax = ddpme->pp_max;
2692 /* Allow for atoms to be maximally 2/3 times the cut-off
2693 * out of their DD cell. This is a reasonable balance between
2694 * performance and support for most charge-group/cut-off
2695 * combinations.
2697 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2698 /* Avoid extra communication when we are exactly at a boundary */
2699 range *= 0.999;
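/* Sketch of the loops below: for each PME slab s, sh is increased while a
 * PP cell belonging to the slab sh+1 slabs away, extended by `range' (a box
 * fraction), still reaches across the boundary of slab s and thus needs to
 * communicate with it. */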
2701 sh = 1;
2702 for (s = 0; s < ns; s++)
2704 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2705 pme_boundary = (real)s/ns;
2706 while (sh+1 < ns &&
2707 ((s-(sh+1) >= 0 &&
2708 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2709 (s-(sh+1) < 0 &&
2710 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2712 sh++;
2714 pme_boundary = (real)(s+1)/ns;
2715 while (sh+1 < ns &&
2716 ((s+(sh+1) < ns &&
2717 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2718 (s+(sh+1) >= ns &&
2719 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2721 sh++;
2726 ddpme->maxshift = sh;
2728 if (debug)
2730 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2731 ddpme->dim, ddpme->maxshift);
2735 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2737 int d, dim;
2739 for (d = 0; d < dd->ndim; d++)
2741 dim = dd->dim[d];
2742 if (dim < ddbox->nboundeddim &&
2743 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2744 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2746 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2747 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2748 dd->nc[dim], dd->comm->cellsize_limit);
2753 enum {
2754 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2757 /* Set the domain boundaries. Use for static (or no) load balancing,
2758 * and also for the starting state for dynamic load balancing.
2759 * setmode determines if and where the boundaries are stored, use the enum above.
2760 * Returns the number of communication pulses in npulse.
2762 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2763 int setmode, ivec npulse)
2765 gmx_domdec_comm_t *comm;
2766 int d, j;
2767 rvec cellsize_min;
2768 real *cell_x, cell_dx, cellsize;
2770 comm = dd->comm;
2772 for (d = 0; d < DIM; d++)
2774 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2775 npulse[d] = 1;
2776 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2778 /* Uniform grid */
2779 cell_dx = ddbox->box_size[d]/dd->nc[d];
2780 switch (setmode)
2782 case setcellsizeslbMASTER:
2783 for (j = 0; j < dd->nc[d]+1; j++)
2785 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2787 break;
2788 case setcellsizeslbLOCAL:
2789 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2790 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2791 break;
2792 default:
2793 break;
2795 cellsize = cell_dx*ddbox->skew_fac[d];
2796 while (cellsize*npulse[d] < comm->cutoff)
2798 npulse[d]++;
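/* In other words, npulse[d] = ceil(cutoff/cellsize): the number of
 * neighbor-communication pulses needed to cover the cut-off. */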
2800 cellsize_min[d] = cellsize;
2802 else
2804 /* Statically load balanced grid */
2805 /* Even when we are not doing a master distribution, we determine
2806 * all cell borders in a loop to obtain values identical
2807 * to the master distribution case and to determine npulse.
2809 if (setmode == setcellsizeslbMASTER)
2811 cell_x = dd->ma->cell_x[d];
2813 else
2815 snew(cell_x, dd->nc[d]+1);
2817 cell_x[0] = ddbox->box0[d];
2818 for (j = 0; j < dd->nc[d]; j++)
2820 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2821 cell_x[j+1] = cell_x[j] + cell_dx;
2822 cellsize = cell_dx*ddbox->skew_fac[d];
2823 while (cellsize*npulse[d] < comm->cutoff &&
2824 npulse[d] < dd->nc[d]-1)
2826 npulse[d]++;
2828 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2830 if (setmode == setcellsizeslbLOCAL)
2832 comm->cell_x0[d] = cell_x[dd->ci[d]];
2833 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2835 if (setmode != setcellsizeslbMASTER)
2837 sfree(cell_x);
2840 /* The following limitation avoids a cell receiving some of its own
2841 * home charge groups back over the periodic boundary.
2842 * Duplicate charge groups cause trouble with the global indices.
2844 if (d < ddbox->npbcdim &&
2845 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2847 char error_string[STRLEN];
2849 sprintf(error_string,
2850 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2851 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2852 comm->cutoff,
2853 dd->nc[d], dd->nc[d],
2854 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2856 if (setmode == setcellsizeslbLOCAL)
2858 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2859 error_string);
2861 else
2863 gmx_fatal(FARGS, error_string);
2868 if (!dlbIsOn(comm))
2870 copy_rvec(cellsize_min, comm->cellsize_min);
2873 for (d = 0; d < comm->npmedecompdim; d++)
2875 set_pme_maxshift(dd, &comm->ddpme[d],
2876 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2877 comm->ddpme[d].slb_dim_f);
2882 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2883 int d, int dim, domdec_root_t *root,
2884 const gmx_ddbox_t *ddbox,
2885 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2887 gmx_domdec_comm_t *comm;
2888 int ncd, i, j, nmin, nmin_old;
2889 gmx_bool bLimLo, bLimHi;
2890 real *cell_size;
2891 real fac, halfway, cellsize_limit_f_i, region_size;
2892 gmx_bool bPBC, bLastHi = FALSE;
2893 int nrange[] = {range[0], range[1]};
2895 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2897 comm = dd->comm;
2899 ncd = dd->nc[dim];
2901 bPBC = (dim < ddbox->npbcdim);
2903 cell_size = root->buf_ncd;
2905 if (debug)
2907 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2910 /* First we need to check that the scaling does not make cells
2911 * smaller than the smallest allowed size.
2912 * We need to do this iteratively, since if a cell is too small,
2913 * it needs to be enlarged, which makes all the other cells smaller,
2914 * which could in turn make another cell smaller than allowed.
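/* The do/while loop below terminates because nmin can only increase and is
 * bounded by the number of cells in the range. */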
2916 for (i = range[0]; i < range[1]; i++)
2918 root->bCellMin[i] = FALSE;
2920 nmin = 0;
2923 nmin_old = nmin;
2924 /* We need the total for normalization */
2925 fac = 0;
2926 for (i = range[0]; i < range[1]; i++)
2928 if (root->bCellMin[i] == FALSE)
2930 fac += cell_size[i];
2933 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2934 /* Determine the cell boundaries */
2935 for (i = range[0]; i < range[1]; i++)
2937 if (root->bCellMin[i] == FALSE)
2939 cell_size[i] *= fac;
2940 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2942 cellsize_limit_f_i = 0;
2944 else
2946 cellsize_limit_f_i = cellsize_limit_f;
2948 if (cell_size[i] < cellsize_limit_f_i)
2950 root->bCellMin[i] = TRUE;
2951 cell_size[i] = cellsize_limit_f_i;
2952 nmin++;
2955 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2958 while (nmin > nmin_old);
2960 i = range[1]-1;
2961 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2962 /* For this check we should not use DD_CELL_MARGIN,
2963 * but a slightly smaller factor,
2964 * since rounding could get us below the limit.
2966 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2968 char buf[22];
2969 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2970 gmx_step_str(step, buf),
2971 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2972 ncd, comm->cellsize_min[dim]);
2975 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2977 if (!bUniform)
2979 /* Check that the boundary did not move by more than half the width
2980 * of each of the cells it bounds, as this could cause problems,
2981 * especially when the differences between cell sizes are large.
2982 * If changes are applied, they will not make cells smaller
2983 * than the cut-off, as we check all the boundaries which
2984 * might be affected by a change and if the old state was ok,
2985 * the cells will at most be shrunk back to their old size.
2987 for (i = range[0]+1; i < range[1]; i++)
2989 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
2990 if (root->cell_f[i] < halfway)
2992 root->cell_f[i] = halfway;
2993 /* Check if the change also causes shifts of the next boundaries */
2994 for (j = i+1; j < range[1]; j++)
2996 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
2998 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3002 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3003 if (root->cell_f[i] > halfway)
3005 root->cell_f[i] = halfway;
3006 /* Check if the change also causes shifts of the next boundaries */
3007 for (j = i-1; j >= range[0]+1; j--)
3009 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3011 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3018 /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
3019 /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3020 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta); oldb and nexta can be the boundaries.
3021 * nrange is used to hold a and b */
3022 if (d > 0)
3024 /* Take care of the staggering of the cell boundaries */
3025 if (bUniform)
3027 for (i = range[0]; i < range[1]; i++)
3029 root->cell_f_max0[i] = root->cell_f[i];
3030 root->cell_f_min1[i] = root->cell_f[i+1];
3033 else
3035 for (i = range[0]+1; i < range[1]; i++)
3037 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3038 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3039 if (bLimLo && bLimHi)
3041 /* Both limits violated, try the best we can */
3042 /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3043 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3044 nrange[0] = range[0];
3045 nrange[1] = i;
3046 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3048 nrange[0] = i;
3049 nrange[1] = range[1];
3050 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3052 return;
3054 else if (bLimLo)
3056 /* root->cell_f[i] = root->bound_min[i]; */
3057 nrange[1] = i; /* only store the violation location; there could be another LimLo violation following with a higher index */
3058 bLastHi = FALSE;
3060 else if (bLimHi && !bLastHi)
3062 bLastHi = TRUE;
3063 if (nrange[1] < range[1]) /* found a LimLo before */
3065 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3066 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3067 nrange[0] = nrange[1];
3069 root->cell_f[i] = root->bound_max[i];
3070 nrange[1] = i;
3071 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3072 nrange[0] = i;
3073 nrange[1] = range[1];
3076 if (nrange[1] < range[1]) /* found a LimLo last */
3078 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3079 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3080 nrange[0] = nrange[1];
3081 nrange[1] = range[1];
3082 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3084 else if (nrange[0] > range[0]) /* found at least one LimHi */
3086 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3093 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3094 int d, int dim, domdec_root_t *root,
3095 const gmx_ddbox_t *ddbox,
3096 gmx_bool bDynamicBox,
3097 gmx_bool bUniform, gmx_int64_t step)
3099 gmx_domdec_comm_t *comm;
3100 int ncd, d1, i, pos;
3101 real *cell_size;
3102 real load_aver, load_i, imbalance, change, change_max, sc;
3103 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3104 real change_limit;
3105 real relax = 0.5;
3106 gmx_bool bPBC;
3107 int range[] = { 0, 0 };
3109 comm = dd->comm;
3111 /* Convert the maximum change from the input percentage to a fraction */
3112 change_limit = comm->dlb_scale_lim*0.01;
3114 ncd = dd->nc[dim];
3116 bPBC = (dim < ddbox->npbcdim);
3118 cell_size = root->buf_ncd;
3120 /* Store the original boundaries */
3121 for (i = 0; i < ncd+1; i++)
3123 root->old_cell_f[i] = root->cell_f[i];
3125 if (bUniform)
3127 for (i = 0; i < ncd; i++)
3129 cell_size[i] = 1.0/ncd;
3132 else if (dd_load_count(comm) > 0)
3134 load_aver = comm->load[d].sum_m/ncd;
3135 change_max = 0;
3136 for (i = 0; i < ncd; i++)
3138 /* Determine the relative imbalance of cell i */
3139 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3140 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3141 /* Determine the change of the cell size using underrelaxation */
3142 change = -relax*imbalance;
3143 change_max = std::max(change_max, std::max(change, -change));
3145 /* Limit the amount of scaling.
3146 * We need to use the same rescaling for all cells in one row,
3147 * otherwise the load balancing might not converge.
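/* For example, with dlb_scale_lim = 10 (percent), change_limit is 0.1;
 * if the largest relative change would be 0.25, all changes are scaled
 * down by 0.1/0.25 so that the largest applied change is 10%. */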
3149 sc = relax;
3150 if (change_max > change_limit)
3152 sc *= change_limit/change_max;
3154 for (i = 0; i < ncd; i++)
3156 /* Determine the relative imbalance of cell i */
3157 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3158 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3159 /* Determine the change of the cell size using underrelaxation */
3160 change = -sc*imbalance;
3161 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3165 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3166 cellsize_limit_f *= DD_CELL_MARGIN;
3167 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3168 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3169 if (ddbox->tric_dir[dim])
3171 cellsize_limit_f /= ddbox->skew_fac[dim];
3172 dist_min_f /= ddbox->skew_fac[dim];
3174 if (bDynamicBox && d > 0)
3176 dist_min_f *= DD_PRES_SCALE_MARGIN;
3178 if (d > 0 && !bUniform)
3180 /* Make sure that the grid is not shifted too much */
3181 for (i = 1; i < ncd; i++)
3183 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3185 gmx_incons("Inconsistent DD boundary staggering limits!");
3187 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3188 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3189 if (space > 0)
3191 root->bound_min[i] += 0.5*space;
3193 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3194 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3195 if (space < 0)
3197 root->bound_max[i] += 0.5*space;
3199 if (debug)
3201 fprintf(debug,
3202 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3203 d, i,
3204 root->cell_f_max0[i-1] + dist_min_f,
3205 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3206 root->cell_f_min1[i] - dist_min_f);
3210 range[1] = ncd;
3211 root->cell_f[0] = 0;
3212 root->cell_f[ncd] = 1;
3213 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3216 /* After the checks above, the cells should obey the cut-off
3217 * restrictions, but it does not hurt to check.
3219 for (i = 0; i < ncd; i++)
3221 if (debug)
3223 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3224 dim, i, root->cell_f[i], root->cell_f[i+1]);
3227 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3228 root->cell_f[i+1] - root->cell_f[i] <
3229 cellsize_limit_f/DD_CELL_MARGIN)
3231 char buf[22];
3232 fprintf(stderr,
3233 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3234 gmx_step_str(step, buf), dim2char(dim), i,
3235 (root->cell_f[i+1] - root->cell_f[i])
3236 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3240 pos = ncd + 1;
3241 /* Store the cell boundaries of the lower dimensions at the end */
3242 for (d1 = 0; d1 < d; d1++)
3244 root->cell_f[pos++] = comm->cell_f0[d1];
3245 root->cell_f[pos++] = comm->cell_f1[d1];
3248 if (d < comm->npmedecompdim)
3250 /* The master determines the maximum shift for
3251 * the coordinate communication between separate PME nodes.
3253 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3255 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3256 if (d >= 1)
3258 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3262 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3263 const gmx_ddbox_t *ddbox,
3264 int dimind)
3266 gmx_domdec_comm_t *comm;
3267 int dim;
3269 comm = dd->comm;
3271 /* Set the cell dimensions */
3272 dim = dd->dim[dimind];
3273 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3274 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3275 if (dim >= ddbox->nboundeddim)
3277 comm->cell_x0[dim] += ddbox->box0[dim];
3278 comm->cell_x1[dim] += ddbox->box0[dim];
3282 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3283 int d, int dim, real *cell_f_row,
3284 const gmx_ddbox_t *ddbox)
3286 gmx_domdec_comm_t *comm;
3287 int d1, pos;
3289 comm = dd->comm;
3291 #if GMX_MPI
3292 /* Each node would only need to know two fractions,
3293 * but it is probably cheaper to broadcast the whole array.
3295 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3296 0, comm->mpi_comm_load[d]);
3297 #endif
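/* Layout of cell_f_row as unpacked below: nc[dim]+1 relative cell boundaries,
 * then cell_f0/cell_f1 pairs for the lower decomposition dimensions,
 * followed by the PME maxshift value(s). */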
3298 /* Copy the fractions for this dimension from the buffer */
3299 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3300 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3301 /* The whole array was communicated, so set the buffer position */
3302 pos = dd->nc[dim] + 1;
3303 for (d1 = 0; d1 <= d; d1++)
3305 if (d1 < d)
3307 /* Copy the cell fractions of the lower dimensions */
3308 comm->cell_f0[d1] = cell_f_row[pos++];
3309 comm->cell_f1[d1] = cell_f_row[pos++];
3311 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3313 /* Convert the communicated shift from float to int */
3314 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3315 if (d >= 1)
3317 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3321 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3322 const gmx_ddbox_t *ddbox,
3323 gmx_bool bDynamicBox,
3324 gmx_bool bUniform, gmx_int64_t step)
3326 gmx_domdec_comm_t *comm;
3327 int d, dim, d1;
3328 gmx_bool bRowMember, bRowRoot;
3329 real *cell_f_row;
3331 comm = dd->comm;
3333 for (d = 0; d < dd->ndim; d++)
3335 dim = dd->dim[d];
3336 bRowMember = TRUE;
3337 bRowRoot = TRUE;
3338 for (d1 = d; d1 < dd->ndim; d1++)
3340 if (dd->ci[dd->dim[d1]] > 0)
3342 if (d1 != d)
3344 bRowMember = FALSE;
3346 bRowRoot = FALSE;
3349 if (bRowMember)
3351 if (bRowRoot)
3353 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3354 ddbox, bDynamicBox, bUniform, step);
3355 cell_f_row = comm->root[d]->cell_f;
3357 else
3359 cell_f_row = comm->cell_f_row;
3361 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3366 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3367 const gmx_ddbox_t *ddbox)
3369 int d;
3371 /* This function assumes the box is static and should therefore
3372 * not be called when the box has changed since the last
3373 * call to dd_partition_system.
3375 for (d = 0; d < dd->ndim; d++)
3377 relative_to_absolute_cell_bounds(dd, ddbox, d);
3383 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3384 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3385 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3386 gmx_wallcycle_t wcycle)
3388 gmx_domdec_comm_t *comm;
3389 int dim;
3391 comm = dd->comm;
3393 if (bDoDLB)
3395 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3396 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3397 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3399 else if (bDynamicBox)
3401 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3404 /* Set the dimensions for which no DD is used */
3405 for (dim = 0; dim < DIM; dim++)
3407 if (dd->nc[dim] == 1)
3409 comm->cell_x0[dim] = 0;
3410 comm->cell_x1[dim] = ddbox->box_size[dim];
3411 if (dim >= ddbox->nboundeddim)
3413 comm->cell_x0[dim] += ddbox->box0[dim];
3414 comm->cell_x1[dim] += ddbox->box0[dim];
3420 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3422 int d, np, i;
3423 gmx_domdec_comm_dim_t *cd;
3425 for (d = 0; d < dd->ndim; d++)
3427 cd = &dd->comm->cd[d];
3428 np = npulse[dd->dim[d]];
3429 if (np > cd->np_nalloc)
3431 if (debug)
3433 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3434 dim2char(dd->dim[d]), np);
3436 if (DDMASTER(dd) && cd->np_nalloc > 0)
3438 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3440 srenew(cd->ind, np);
3441 for (i = cd->np_nalloc; i < np; i++)
3443 cd->ind[i].index = nullptr;
3444 cd->ind[i].nalloc = 0;
3446 cd->np_nalloc = np;
3448 cd->np = np;
3453 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3454 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3455 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3456 gmx_wallcycle_t wcycle)
3458 gmx_domdec_comm_t *comm;
3459 int d;
3460 ivec npulse;
3462 comm = dd->comm;
3464 /* Copy the old cell boundaries for the cg displacement check */
3465 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3466 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3468 if (dlbIsOn(comm))
3470 if (DDMASTER(dd))
3472 check_box_size(dd, ddbox);
3474 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3476 else
3478 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3479 realloc_comm_ind(dd, npulse);
3482 if (debug)
3484 for (d = 0; d < DIM; d++)
3486 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3487 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3492 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3493 gmx_ddbox_t *ddbox,
3494 rvec cell_ns_x0, rvec cell_ns_x1,
3495 gmx_int64_t step)
3497 gmx_domdec_comm_t *comm;
3498 int dim_ind, dim;
3500 comm = dd->comm;
3502 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3504 dim = dd->dim[dim_ind];
3506 /* Without PBC we don't have restrictions on the outer cells */
3507 if (!(dim >= ddbox->npbcdim &&
3508 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3509 dlbIsOn(comm) &&
3510 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3511 comm->cellsize_min[dim])
3513 char buf[22];
3514 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3515 gmx_step_str(step, buf), dim2char(dim),
3516 comm->cell_x1[dim] - comm->cell_x0[dim],
3517 ddbox->skew_fac[dim],
3518 dd->comm->cellsize_min[dim],
3519 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3523 if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3525 /* Communicate the boundaries and update cell_ns_x0/1 */
3526 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3527 if (dlbIsOn(dd->comm) && dd->ndim > 1)
3529 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3534 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3536 if (YY < npbcdim)
3538 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3540 else
3542 tcm[YY][XX] = 0;
3544 if (ZZ < npbcdim)
3546 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3547 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3549 else
3551 tcm[ZZ][XX] = 0;
3552 tcm[ZZ][YY] = 0;
3556 static void check_screw_box(matrix box)
3558 /* Mathematical limitation */
3559 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3561 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3564 /* Limitation due to the asymmetry of the eighth shell method */
3565 if (box[ZZ][YY] != 0)
3567 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3571 static void distribute_cg(FILE *fplog,
3572 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3573 gmx_domdec_t *dd)
3575 gmx_domdec_master_t *ma;
3576 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3577 int i, icg, j, k, k0, k1, d;
3578 matrix tcm;
3579 rvec cg_cm;
3580 ivec ind;
3581 real nrcg, inv_ncg, pos_d;
3582 int *cgindex;
3583 gmx_bool bScrew;
3585 ma = dd->ma;
3587 snew(tmp_nalloc, dd->nnodes);
3588 snew(tmp_ind, dd->nnodes);
3589 for (i = 0; i < dd->nnodes; i++)
3591 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3592 snew(tmp_ind[i], tmp_nalloc[i]);
3595 /* Clear the count */
3596 for (i = 0; i < dd->nnodes; i++)
3598 ma->ncg[i] = 0;
3599 ma->nat[i] = 0;
3602 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3604 cgindex = cgs->index;
3606 /* Compute the center of geometry for all charge groups */
3607 for (icg = 0; icg < cgs->nr; icg++)
3609 k0 = cgindex[icg];
3610 k1 = cgindex[icg+1];
3611 nrcg = k1 - k0;
3612 if (nrcg == 1)
3614 copy_rvec(pos[k0], cg_cm);
3616 else
3618 inv_ncg = 1.0/nrcg;
3620 clear_rvec(cg_cm);
3621 for (k = k0; (k < k1); k++)
3623 rvec_inc(cg_cm, pos[k]);
3625 for (d = 0; (d < DIM); d++)
3627 cg_cm[d] *= inv_ncg;
3630 /* Put the charge group in the box and determine the cell index */
3631 for (d = DIM-1; d >= 0; d--)
3633 pos_d = cg_cm[d];
3634 if (d < dd->npbcdim)
3636 bScrew = (dd->bScrewPBC && d == XX);
3637 if (tric_dir[d] && dd->nc[d] > 1)
3639 /* Use triclinic coordinates for this dimension */
3640 for (j = d+1; j < DIM; j++)
3642 pos_d += cg_cm[j]*tcm[j][d];
3645 while (pos_d >= box[d][d])
3647 pos_d -= box[d][d];
3648 rvec_dec(cg_cm, box[d]);
3649 if (bScrew)
3651 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3652 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3654 for (k = k0; (k < k1); k++)
3656 rvec_dec(pos[k], box[d]);
3657 if (bScrew)
3659 pos[k][YY] = box[YY][YY] - pos[k][YY];
3660 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3664 while (pos_d < 0)
3666 pos_d += box[d][d];
3667 rvec_inc(cg_cm, box[d]);
3668 if (bScrew)
3670 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3671 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3673 for (k = k0; (k < k1); k++)
3675 rvec_inc(pos[k], box[d]);
3676 if (bScrew)
3678 pos[k][YY] = box[YY][YY] - pos[k][YY];
3679 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3684 /* This could be done more efficiently */
3685 ind[d] = 0;
3686 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3688 ind[d]++;
3691 i = dd_index(dd->nc, ind);
3692 if (ma->ncg[i] == tmp_nalloc[i])
3694 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3695 srenew(tmp_ind[i], tmp_nalloc[i]);
3697 tmp_ind[i][ma->ncg[i]] = icg;
3698 ma->ncg[i]++;
3699 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3702 k1 = 0;
3703 for (i = 0; i < dd->nnodes; i++)
3705 ma->index[i] = k1;
3706 for (k = 0; k < ma->ncg[i]; k++)
3708 ma->cg[k1++] = tmp_ind[i][k];
3711 ma->index[dd->nnodes] = k1;
3713 for (i = 0; i < dd->nnodes; i++)
3715 sfree(tmp_ind[i]);
3717 sfree(tmp_ind);
3718 sfree(tmp_nalloc);
3720 if (fplog)
3722 // Use double for the sums to avoid natoms^2 overflowing
3723 // (65537^2 > 2^32)
3724 int nat_sum, nat_min, nat_max;
3725 double nat2_sum;
3727 nat_sum = 0;
3728 nat2_sum = 0;
3729 nat_min = ma->nat[0];
3730 nat_max = ma->nat[0];
3731 for (i = 0; i < dd->nnodes; i++)
3733 nat_sum += ma->nat[i];
3734 // cast to double to avoid integer overflows when squaring
3735 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3736 nat_min = std::min(nat_min, ma->nat[i]);
3737 nat_max = std::max(nat_max, ma->nat[i]);
3739 nat_sum /= dd->nnodes;
3740 nat2_sum /= dd->nnodes;
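/* The printed stddev is sqrt(<nat^2> - <nat>^2), the population standard
 * deviation of the per-domain atom counts. */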
3742 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3743 dd->nnodes,
3744 nat_sum,
3745 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3746 nat_min, nat_max);
3750 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3751 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3752 rvec pos[])
3754 gmx_domdec_master_t *ma = nullptr;
3755 ivec npulse;
3756 int i, cg_gl;
3757 int *ibuf, buf2[2] = { 0, 0 };
3758 gmx_bool bMaster = DDMASTER(dd);
3760 if (bMaster)
3762 ma = dd->ma;
3764 if (dd->bScrewPBC)
3766 check_screw_box(box);
3769 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3771 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3772 for (i = 0; i < dd->nnodes; i++)
3774 ma->ibuf[2*i] = ma->ncg[i];
3775 ma->ibuf[2*i+1] = ma->nat[i];
3777 ibuf = ma->ibuf;
3779 else
3781 ibuf = nullptr;
3783 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3785 dd->ncg_home = buf2[0];
3786 dd->nat_home = buf2[1];
3787 dd->ncg_tot = dd->ncg_home;
3788 dd->nat_tot = dd->nat_home;
3789 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3791 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3792 srenew(dd->index_gl, dd->cg_nalloc);
3793 srenew(dd->cgindex, dd->cg_nalloc+1);
3795 if (bMaster)
3797 for (i = 0; i < dd->nnodes; i++)
3799 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3800 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3804 dd_scatterv(dd,
3805 bMaster ? ma->ibuf : nullptr,
3806 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3807 bMaster ? ma->cg : nullptr,
3808 dd->ncg_home*sizeof(int), dd->index_gl);
3810 /* Determine the home charge group sizes */
3811 dd->cgindex[0] = 0;
3812 for (i = 0; i < dd->ncg_home; i++)
3814 cg_gl = dd->index_gl[i];
3815 dd->cgindex[i+1] =
3816 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3819 if (debug)
3821 fprintf(debug, "Home charge groups:\n");
3822 for (i = 0; i < dd->ncg_home; i++)
3824 fprintf(debug, " %d", dd->index_gl[i]);
3825 if (i % 10 == 9)
3827 fprintf(debug, "\n");
3830 fprintf(debug, "\n");
3834 static int compact_and_copy_vec_at(int ncg, int *move,
3835 int *cgindex,
3836 int nvec, int vec,
3837 rvec *src, gmx_domdec_comm_t *comm,
3838 gmx_bool bCompact)
3840 int m, icg, i, i0, i1, nrcg;
3841 int home_pos;
3842 int pos_vec[DIM*2];
3844 home_pos = 0;
3846 for (m = 0; m < DIM*2; m++)
3848 pos_vec[m] = 0;
3851 i0 = 0;
3852 for (icg = 0; icg < ncg; icg++)
3854 i1 = cgindex[icg+1];
3855 m = move[icg];
3856 if (m == -1)
3858 if (bCompact)
3860 /* Compact the home array in place */
3861 for (i = i0; i < i1; i++)
3863 copy_rvec(src[i], src[home_pos++]);
3867 else
3869 /* Copy to the communication buffer */
3870 nrcg = i1 - i0;
3871 pos_vec[m] += 1 + vec*nrcg;
3872 for (i = i0; i < i1; i++)
3874 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3876 pos_vec[m] += (nvec - vec - 1)*nrcg;
3878 if (!bCompact)
3880 home_pos += i1 - i0;
3882 i0 = i1;
3885 return home_pos;
3888 static int compact_and_copy_vec_cg(int ncg, int *move,
3889 int *cgindex,
3890 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3891 gmx_bool bCompact)
3893 int m, icg, i0, i1, nrcg;
3894 int home_pos;
3895 int pos_vec[DIM*2];
3897 home_pos = 0;
3899 for (m = 0; m < DIM*2; m++)
3901 pos_vec[m] = 0;
3904 i0 = 0;
3905 for (icg = 0; icg < ncg; icg++)
3907 i1 = cgindex[icg+1];
3908 m = move[icg];
3909 if (m == -1)
3911 if (bCompact)
3913 /* Compact the home array in place */
3914 copy_rvec(src[icg], src[home_pos++]);
3917 else
3919 nrcg = i1 - i0;
3920 /* Copy to the communication buffer */
3921 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3922 pos_vec[m] += 1 + nrcg*nvec;
3924 i0 = i1;
3926 if (!bCompact)
3928 home_pos = ncg;
3931 return home_pos;
3934 static int compact_ind(int ncg, int *move,
3935 int *index_gl, int *cgindex,
3936 int *gatindex,
3937 gmx_ga2la_t *ga2la, char *bLocalCG,
3938 int *cginfo)
3940 int cg, nat, a0, a1, a, a_gl;
3941 int home_pos;
3943 home_pos = 0;
3944 nat = 0;
3945 for (cg = 0; cg < ncg; cg++)
3947 a0 = cgindex[cg];
3948 a1 = cgindex[cg+1];
3949 if (move[cg] == -1)
3951 /* Compact the home arrays in place.
3952 * Anything that can be done here avoids access to global arrays.
3954 cgindex[home_pos] = nat;
3955 for (a = a0; a < a1; a++)
3957 a_gl = gatindex[a];
3958 gatindex[nat] = a_gl;
3959 /* The cell number stays 0, so we don't need to set it */
3960 ga2la_change_la(ga2la, a_gl, nat);
3961 nat++;
3963 index_gl[home_pos] = index_gl[cg];
3964 cginfo[home_pos] = cginfo[cg];
3965 /* The charge group remains local, so bLocalCG does not change */
3966 home_pos++;
3968 else
3970 /* Clear the global indices */
3971 for (a = a0; a < a1; a++)
3973 ga2la_del(ga2la, gatindex[a]);
3975 if (bLocalCG)
3977 bLocalCG[index_gl[cg]] = FALSE;
3981 cgindex[home_pos] = nat;
3983 return home_pos;
3986 static void clear_and_mark_ind(int ncg, int *move,
3987 int *index_gl, int *cgindex, int *gatindex,
3988 gmx_ga2la_t *ga2la, char *bLocalCG,
3989 int *cell_index)
3991 int cg, a0, a1, a;
3993 for (cg = 0; cg < ncg; cg++)
3995 if (move[cg] >= 0)
3997 a0 = cgindex[cg];
3998 a1 = cgindex[cg+1];
3999 /* Clear the global indices */
4000 for (a = a0; a < a1; a++)
4002 ga2la_del(ga2la, gatindex[a]);
4004 if (bLocalCG)
4006 bLocalCG[index_gl[cg]] = FALSE;
4008 /* Signal that this cg has moved using the ns cell index.
4009 * Here we set it to -1. fill_grid will change it
4010 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4012 cell_index[cg] = -1;
4017 static void print_cg_move(FILE *fplog,
4018 gmx_domdec_t *dd,
4019 gmx_int64_t step, int cg, int dim, int dir,
4020 gmx_bool bHaveCgcmOld, real limitd,
4021 rvec cm_old, rvec cm_new, real pos_d)
4023 gmx_domdec_comm_t *comm;
4024 char buf[22];
4026 comm = dd->comm;
4028 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4029 if (limitd > 0)
4031 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4032 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4033 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4035 else
4037 /* We don't have a limiting distance available: don't print it */
4038 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4039 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4040 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4042 fprintf(fplog, "distance out of cell %f\n",
4043 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4044 if (bHaveCgcmOld)
4046 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4047 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4049 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4050 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4051 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4052 dim2char(dim),
4053 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4054 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4055 dim2char(dim),
4056 comm->cell_x0[dim], comm->cell_x1[dim]);
4059 static void cg_move_error(FILE *fplog,
4060 gmx_domdec_t *dd,
4061 gmx_int64_t step, int cg, int dim, int dir,
4062 gmx_bool bHaveCgcmOld, real limitd,
4063 rvec cm_old, rvec cm_new, real pos_d)
4065 if (fplog)
4067 print_cg_move(fplog, dd, step, cg, dim, dir,
4068 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4070 print_cg_move(stderr, dd, step, cg, dim, dir,
4071 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4072 gmx_fatal(FARGS,
4073 "%s moved too far between two domain decomposition steps\n"
4074 "This usually means that your system is not well equilibrated",
4075 dd->comm->bCGs ? "A charge group" : "An atom");
4078 static void rotate_state_atom(t_state *state, int a)
4080 if (state->flags & (1 << estX))
4082 /* Rotate the complete state; for a rectangular box only */
4083 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4084 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4086 if (state->flags & (1 << estV))
4088 state->v[a][YY] = -state->v[a][YY];
4089 state->v[a][ZZ] = -state->v[a][ZZ];
4091 if (state->flags & (1 << estCGP))
4093 state->cg_p[a][YY] = -state->cg_p[a][YY];
4094 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4098 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4100 if (natoms > comm->moved_nalloc)
4102 /* Contents should be preserved here */
4103 comm->moved_nalloc = over_alloc_dd(natoms);
4104 srenew(comm->moved, comm->moved_nalloc);
4107 return comm->moved;
4110 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4111 gmx_domdec_t *dd,
4112 t_state *state,
4113 ivec tric_dir, matrix tcm,
4114 rvec cell_x0, rvec cell_x1,
4115 rvec limitd, rvec limit0, rvec limit1,
4116 const int *cgindex,
4117 int cg_start, int cg_end,
4118 rvec *cg_cm,
4119 int *move)
4121 int npbcdim;
4122 int cg, k, k0, k1, d, dim, d2;
4123 int mc, nrcg;
4124 int flag;
4125 gmx_bool bScrew;
4126 ivec dev;
4127 real inv_ncg, pos_d;
4128 rvec cm_new;
4130 npbcdim = dd->npbcdim;
4132 for (cg = cg_start; cg < cg_end; cg++)
4134 k0 = cgindex[cg];
4135 k1 = cgindex[cg+1];
4136 nrcg = k1 - k0;
4137 if (nrcg == 1)
4139 copy_rvec(state->x[k0], cm_new);
4141 else
4143 inv_ncg = 1.0/nrcg;
4145 clear_rvec(cm_new);
4146 for (k = k0; (k < k1); k++)
4148 rvec_inc(cm_new, state->x[k]);
4150 for (d = 0; (d < DIM); d++)
4152 cm_new[d] = inv_ncg*cm_new[d];
4156 clear_ivec(dev);
4157 /* Do pbc and check DD cell boundary crossings */
4158 for (d = DIM-1; d >= 0; d--)
4160 if (dd->nc[d] > 1)
4162 bScrew = (dd->bScrewPBC && d == XX);
4163 /* Determine the location of this cg in lattice coordinates */
4164 pos_d = cm_new[d];
4165 if (tric_dir[d])
4167 for (d2 = d+1; d2 < DIM; d2++)
4169 pos_d += cm_new[d2]*tcm[d2][d];
4172 /* Put the charge group in the triclinic unit-cell */
4173 if (pos_d >= cell_x1[d])
4175 if (pos_d >= limit1[d])
4177 cg_move_error(fplog, dd, step, cg, d, 1,
4178 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4179 cg_cm[cg], cm_new, pos_d);
4181 dev[d] = 1;
4182 if (dd->ci[d] == dd->nc[d] - 1)
4184 rvec_dec(cm_new, state->box[d]);
4185 if (bScrew)
4187 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4188 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4190 for (k = k0; (k < k1); k++)
4192 rvec_dec(state->x[k], state->box[d]);
4193 if (bScrew)
4195 rotate_state_atom(state, k);
4200 else if (pos_d < cell_x0[d])
4202 if (pos_d < limit0[d])
4204 cg_move_error(fplog, dd, step, cg, d, -1,
4205 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4206 cg_cm[cg], cm_new, pos_d);
4208 dev[d] = -1;
4209 if (dd->ci[d] == 0)
4211 rvec_inc(cm_new, state->box[d]);
4212 if (bScrew)
4214 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4215 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4217 for (k = k0; (k < k1); k++)
4219 rvec_inc(state->x[k], state->box[d]);
4220 if (bScrew)
4222 rotate_state_atom(state, k);
4228 else if (d < npbcdim)
4230 /* Put the charge group in the rectangular unit-cell */
4231 while (cm_new[d] >= state->box[d][d])
4233 rvec_dec(cm_new, state->box[d]);
4234 for (k = k0; (k < k1); k++)
4236 rvec_dec(state->x[k], state->box[d]);
4239 while (cm_new[d] < 0)
4241 rvec_inc(cm_new, state->box[d]);
4242 for (k = k0; (k < k1); k++)
4244 rvec_inc(state->x[k], state->box[d]);
4250 copy_rvec(cm_new, cg_cm[cg]);
4252 /* Determine where this cg should go */
4253 flag = 0;
4254 mc = -1;
4255 for (d = 0; d < dd->ndim; d++)
4257 dim = dd->dim[d];
4258 if (dev[dim] == 1)
4260 flag |= DD_FLAG_FW(d);
4261 if (mc == -1)
4263 mc = d*2;
4266 else if (dev[dim] == -1)
4268 flag |= DD_FLAG_BW(d);
4269 if (mc == -1)
4271 if (dd->nc[dim] > 2)
4273 mc = d*2 + 1;
4275 else
4277 mc = d*2;
4282 /* Temporarily store the flag in move */
4283 move[cg] = mc + flag;
4287 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4288 gmx_domdec_t *dd, ivec tric_dir,
4289 t_state *state, PaddedRVecVector *f,
4290 t_forcerec *fr,
4291 gmx_bool bCompact,
4292 t_nrnb *nrnb,
4293 int *ncg_stay_home,
4294 int *ncg_moved)
4296 int *move;
4297 int npbcdim;
4298 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4299 int i, cg, k, d, dim, dim2, dir, d2, d3;
4300 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4301 int sbuf[2], rbuf[2];
4302 int home_pos_cg, home_pos_at, buf_pos;
4303 int flag;
4304 real pos_d;
4305 matrix tcm;
4306 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4307 int *cgindex;
4308 cginfo_mb_t *cginfo_mb;
4309 gmx_domdec_comm_t *comm;
4310 int *moved;
4311 int nthread, thread;
4313 if (dd->bScrewPBC)
4315 check_screw_box(state->box);
4318 comm = dd->comm;
4319 if (fr->cutoff_scheme == ecutsGROUP)
4321 cg_cm = fr->cg_cm;
4324 // Positions are always present, so there's nothing to flag
4325 bool bV = state->flags & (1<<estV);
4326 bool bCGP = state->flags & (1<<estCGP);
4328 if (dd->ncg_tot > comm->nalloc_int)
4330 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4331 srenew(comm->buf_int, comm->nalloc_int);
4333 move = comm->buf_int;
4335 npbcdim = dd->npbcdim;
4337 for (d = 0; (d < DIM); d++)
4339 limitd[d] = dd->comm->cellsize_min[d];
4340 if (d >= npbcdim && dd->ci[d] == 0)
4342 cell_x0[d] = -GMX_FLOAT_MAX;
4344 else
4346 cell_x0[d] = comm->cell_x0[d];
4348 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4350 cell_x1[d] = GMX_FLOAT_MAX;
4352 else
4354 cell_x1[d] = comm->cell_x1[d];
4356 if (d < npbcdim)
4358 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4359 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4361 else
4363 /* We check after communication if a charge group moved
4364 * more than one cell. Set the pre-comm check limit to float_max.
4366 limit0[d] = -GMX_FLOAT_MAX;
4367 limit1[d] = GMX_FLOAT_MAX;
4371 make_tric_corr_matrix(npbcdim, state->box, tcm);
4373 cgindex = dd->cgindex;
4375 nthread = gmx_omp_nthreads_get(emntDomdec);
4377 /* Compute the center of geometry for all home charge groups
4378 * and put them in the box and determine where they should go.
4380 #pragma omp parallel for num_threads(nthread) schedule(static)
4381 for (thread = 0; thread < nthread; thread++)
4385 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4386 cell_x0, cell_x1, limitd, limit0, limit1,
4387 cgindex,
4388 ( thread *dd->ncg_home)/nthread,
4389 ((thread+1)*dd->ncg_home)/nthread,
4390 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4391 move);
4393 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4396 for (cg = 0; cg < dd->ncg_home; cg++)
4398 if (move[cg] >= 0)
4400 mc = move[cg];
4401 flag = mc & ~DD_FLAG_NRCG;
4402 mc = mc & DD_FLAG_NRCG;
4403 move[cg] = mc;
4405 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4407 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4408 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4410 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4411 /* We store the cg size in the lower 16 bits
4412 * and the place where the charge group should go
4413 * in the next 6 bits. This saves some communication volume.
4415 nrcg = cgindex[cg+1] - cgindex[cg];
4416 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
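/* On the receiving side this word is read back as flag = buf_int[cg*DD_CGIBS+1],
 * with nrcg = flag & DD_FLAG_NRCG (see the unpacking further below). */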
4417 ncg[mc] += 1;
4418 nat[mc] += nrcg;
4422 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4423 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4425 *ncg_moved = 0;
4426 for (i = 0; i < dd->ndim*2; i++)
4428 *ncg_moved += ncg[i];
4431 nvec = 1;
4432 if (bV)
4434 nvec++;
4436 if (bCGP)
4438 nvec++;
4441 /* Make sure the communication buffers are large enough */
4442 for (mc = 0; mc < dd->ndim*2; mc++)
4444 nvr = ncg[mc] + nat[mc]*nvec;
4445 if (nvr > comm->cgcm_state_nalloc[mc])
4447 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4448 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4452 switch (fr->cutoff_scheme)
4454 case ecutsGROUP:
4455 /* Recalculating cg_cm might be cheaper than communicating,
4456 * but that could give rise to rounding issues.
4458 home_pos_cg =
4459 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4460 nvec, cg_cm, comm, bCompact);
4461 break;
4462 case ecutsVERLET:
4463 /* Without charge groups we send the moved atom coordinates
4464 * over twice. This is so the code below can be used without
4465 * many conditionals both with and without charge groups.
4467 home_pos_cg =
4468 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4469 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4470 if (bCompact)
4472 home_pos_cg -= *ncg_moved;
4474 break;
4475 default:
4476 gmx_incons("unimplemented");
4477 home_pos_cg = 0;
4480 vec = 0;
4481 home_pos_at =
4482 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4483 nvec, vec++, as_rvec_array(state->x.data()),
4484 comm, bCompact);
4485 if (bV)
4487 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4488 nvec, vec++, as_rvec_array(state->v.data()),
4489 comm, bCompact);
4491 if (bCGP)
4493 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4494 nvec, vec++, as_rvec_array(state->cg_p.data()),
4495 comm, bCompact);
4498 if (bCompact)
4500 compact_ind(dd->ncg_home, move,
4501 dd->index_gl, dd->cgindex, dd->gatindex,
4502 dd->ga2la, comm->bLocalCG,
4503 fr->cginfo);
4505 else
4507 if (fr->cutoff_scheme == ecutsVERLET)
4509 moved = get_moved(comm, dd->ncg_home);
4511 for (k = 0; k < dd->ncg_home; k++)
4513 moved[k] = 0;
4516 else
4518 moved = fr->ns->grid->cell_index;
4521 clear_and_mark_ind(dd->ncg_home, move,
4522 dd->index_gl, dd->cgindex, dd->gatindex,
4523 dd->ga2la, comm->bLocalCG,
4524 moved);
4527 cginfo_mb = fr->cginfo_mb;
4529 *ncg_stay_home = home_pos_cg;
4530 for (d = 0; d < dd->ndim; d++)
4532 dim = dd->dim[d];
4533 ncg_recv = 0;
4534 nvr = 0;
4535 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4537 cdd = d*2 + dir;
4538 /* Communicate the cg and atom counts */
4539 sbuf[0] = ncg[cdd];
4540 sbuf[1] = nat[cdd];
4541 if (debug)
4543 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4544 d, dir, sbuf[0], sbuf[1]);
4546 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4548 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4550 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4551 srenew(comm->buf_int, comm->nalloc_int);
4554 /* Communicate the charge group indices, sizes and flags */
4555 dd_sendrecv_int(dd, d, dir,
4556 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4557 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4559 nvs = ncg[cdd] + nat[cdd]*nvec;
4560 i = rbuf[0] + rbuf[1] *nvec;
4561 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4563 /* Communicate cgcm and state */
4564 dd_sendrecv_rvec(dd, d, dir,
4565 comm->cgcm_state[cdd], nvs,
4566 comm->vbuf.v+nvr, i);
4567 ncg_recv += rbuf[0];
4568 nvr += i;
4571 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4572 if (fr->cutoff_scheme == ecutsGROUP)
4574 /* Here we resize to more than necessary and shrink later */
4575 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4578 /* Process the received charge groups */
4579 buf_pos = 0;
4580 for (cg = 0; cg < ncg_recv; cg++)
4582 flag = comm->buf_int[cg*DD_CGIBS+1];
4584 if (dim >= npbcdim && dd->nc[dim] > 2)
4586 /* No pbc in this dim and more than one domain boundary.
4587 * We do a separate check whether a charge group moved too far.
4589 if (((flag & DD_FLAG_FW(d)) &&
4590 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4591 ((flag & DD_FLAG_BW(d)) &&
4592 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4594 cg_move_error(fplog, dd, step, cg, dim,
4595 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4596 fr->cutoff_scheme == ecutsGROUP, 0,
4597 comm->vbuf.v[buf_pos],
4598 comm->vbuf.v[buf_pos],
4599 comm->vbuf.v[buf_pos][dim]);
4603 mc = -1;
4604 if (d < dd->ndim-1)
4606 /* Check which direction this cg should go */
4607 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4609 if (dlbIsOn(dd->comm))
4611 /* The cell boundaries for dimension d2 are not equal
4612 * for each cell row of the lower dimension(s),
4613 * therefore we might need to redetermine where
4614 * this cg should go.
4616 dim2 = dd->dim[d2];
4617 /* If this cg crosses the box boundary in dimension d2
4618 * we can use the communicated flag, so we do not
4619 * have to worry about pbc.
4621 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4622 (flag & DD_FLAG_FW(d2))) ||
4623 (dd->ci[dim2] == 0 &&
4624 (flag & DD_FLAG_BW(d2)))))
4626 /* Clear the two flags for this dimension */
4627 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4628 /* Determine the location of this cg
4629 * in lattice coordinates
4631 pos_d = comm->vbuf.v[buf_pos][dim2];
4632 if (tric_dir[dim2])
4634 for (d3 = dim2+1; d3 < DIM; d3++)
4636 pos_d +=
4637 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4640 /* Check that we are not at the box edge.
4641 * pbc is only handled in the first step above,
4642 * but this check could cross a pbc boundary while
4643 * the first step did not, due to different rounding.
4645 if (pos_d >= cell_x1[dim2] &&
4646 dd->ci[dim2] != dd->nc[dim2]-1)
4648 flag |= DD_FLAG_FW(d2);
4650 else if (pos_d < cell_x0[dim2] &&
4651 dd->ci[dim2] != 0)
4653 flag |= DD_FLAG_BW(d2);
4655 comm->buf_int[cg*DD_CGIBS+1] = flag;
4658 /* Set to which neighboring cell this cg should go */
4659 if (flag & DD_FLAG_FW(d2))
4661 mc = d2*2;
4663 else if (flag & DD_FLAG_BW(d2))
4665 if (dd->nc[dd->dim[d2]] > 2)
4667 mc = d2*2+1;
4669 else
4671 mc = d2*2;
4677 nrcg = flag & DD_FLAG_NRCG;
4678 if (mc == -1)
4680 if (home_pos_cg+1 > dd->cg_nalloc)
4682 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4683 srenew(dd->index_gl, dd->cg_nalloc);
4684 srenew(dd->cgindex, dd->cg_nalloc+1);
4686 /* Set the global charge group index and size */
4687 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4688 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4689 /* Copy the state from the buffer */
4690 if (fr->cutoff_scheme == ecutsGROUP)
4692 cg_cm = fr->cg_cm;
4693 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4695 buf_pos++;
4697 /* Set the cginfo */
4698 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4699 dd->index_gl[home_pos_cg]);
4700 if (comm->bLocalCG)
4702 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4705 for (i = 0; i < nrcg; i++)
4707 copy_rvec(comm->vbuf.v[buf_pos++],
4708 state->x[home_pos_at+i]);
4710 if (bV)
4712 for (i = 0; i < nrcg; i++)
4714 copy_rvec(comm->vbuf.v[buf_pos++],
4715 state->v[home_pos_at+i]);
4718 if (bCGP)
4720 for (i = 0; i < nrcg; i++)
4722 copy_rvec(comm->vbuf.v[buf_pos++],
4723 state->cg_p[home_pos_at+i]);
4726 home_pos_cg += 1;
4727 home_pos_at += nrcg;
4729 else
4731 /* Reallocate the buffers if necessary */
4732 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4734 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4735 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4737 nvr = ncg[mc] + nat[mc]*nvec;
4738 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4740 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4741 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4743 /* Copy from the receive to the send buffers */
4744 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4745 comm->buf_int + cg*DD_CGIBS,
4746 DD_CGIBS*sizeof(int));
4747 memcpy(comm->cgcm_state[mc][nvr],
4748 comm->vbuf.v[buf_pos],
4749 (1+nrcg*nvec)*sizeof(rvec));
4750 buf_pos += 1 + nrcg*nvec;
4751 ncg[mc] += 1;
4752 nat[mc] += nrcg;
4757 /* With sorting (!bCompact) the indices are now only partially up to date
4758 * and ncg_home and nat_home are not the real count, since there are
4759 * "holes" in the arrays for the charge groups that moved to neighbors.
4761 if (fr->cutoff_scheme == ecutsVERLET)
4763 moved = get_moved(comm, home_pos_cg);
4765 for (i = dd->ncg_home; i < home_pos_cg; i++)
4767 moved[i] = 0;
4770 dd->ncg_home = home_pos_cg;
4771 dd->nat_home = home_pos_at;
4773 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4775 /* We overallocated before, we need to set the right size here */
4776 dd_resize_state(state, f, dd->nat_home);
4779 if (debug)
4781 fprintf(debug,
4782 "Finished repartitioning: cgs moved out %d, new home %d\n",
4783 *ncg_moved, dd->ncg_home-*ncg_moved);
4788 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4790 /* Note that the cycles value can be incorrect, either 0 or some
4791 * extremely large value, when our thread migrated to another core
4792 * with an unsynchronized cycle counter. If this happens less often
4793 * that once per nstlist steps, this will not cause issues, since
4794 * we later subtract the maximum value from the sum over nstlist steps.
4795 * A zero count will slightly lower the total, but that's a small effect.
4796 * Note that the main purpose of the subtraction of the maximum value
4797 * is to avoid throwing off the load balancing when stalls occur due
4798 * e.g. system activity or network congestion.
4800 dd->comm->cycl[ddCycl] += cycles;
4801 dd->comm->cycl_n[ddCycl]++;
4802 if (cycles > dd->comm->cycl_max[ddCycl])
4804 dd->comm->cycl_max[ddCycl] = cycles;
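/* Illustrative example (hypothetical numbers): with nstlist = 10, a rank
 * might record nine samples of ~100 cycles and one stalled sample of 900.
 * The sum over the interval is then 1800, but because the load balancer
 * later subtracts the maximum (900), the remaining 900 cycles again
 * correspond to the typical ~100 cycles per step, so a single outlier
 * does not distort the balancing.
 */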
4808 static double force_flop_count(t_nrnb *nrnb)
4810 int i;
4811 double sum;
4812 const char *name;
4814 sum = 0;
4815 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4817 /* To get closer to the real timings, we halve the count
4818 * for the normal loops and halve it again for water loops.
4820 name = nrnb_str(i);
4821 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4823 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4825 else
4827 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4830 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4832 name = nrnb_str(i);
4833 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4835 sum += nrnb->n[i]*cost_nrnb(i);
4838 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4840 sum += nrnb->n[i]*cost_nrnb(i);
4843 return sum;
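/* Summary of the weighting applied above (descriptive only): entries in the
 * plain non-bonded kernel range are counted at half cost, or at a quarter
 * cost when the kernel name contains "W3" or "W4" (water loops); in the
 * free-energy..1-4 range only the water-loop entries contribute, at full
 * cost; bonded terms from eNR_BONDS to eNR_WALLS are counted at full cost.
 */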
4846 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4848 if (dd->comm->eFlop)
4850 dd->comm->flop -= force_flop_count(nrnb);
4853 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4855 if (dd->comm->eFlop)
4857 dd->comm->flop += force_flop_count(nrnb);
4858 dd->comm->flop_n++;
4862 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4864 int i;
4866 for (i = 0; i < ddCyclNr; i++)
4868 dd->comm->cycl[i] = 0;
4869 dd->comm->cycl_n[i] = 0;
4870 dd->comm->cycl_max[i] = 0;
4872 dd->comm->flop = 0;
4873 dd->comm->flop_n = 0;
4876 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4878 gmx_domdec_comm_t *comm;
4879 domdec_load_t *load;
4880 domdec_root_t *root = nullptr;
4881 int d, dim, i, pos;
4882 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4883 gmx_bool bSepPME;
4885 if (debug)
4887 fprintf(debug, "get_load_distribution start\n");
4890 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4892 comm = dd->comm;
4894 bSepPME = (dd->pme_nodeid >= 0);
4896 if (dd->ndim == 0 && bSepPME)
4898 /* Without decomposition, but with PME nodes, we need the load */
4899 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4900 comm->load[0].pme = comm->cycl[ddCyclPME];
4903 for (d = dd->ndim-1; d >= 0; d--)
4905 dim = dd->dim[d];
4906 /* Check if we participate in the communication in this dimension */
4907 if (d == dd->ndim-1 ||
4908 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4910 load = &comm->load[d];
4911 if (dlbIsOn(dd->comm))
4913 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4915 pos = 0;
4916 if (d == dd->ndim-1)
4918 sbuf[pos++] = dd_force_load(comm);
4919 sbuf[pos++] = sbuf[0];
4920 if (dlbIsOn(dd->comm))
4922 sbuf[pos++] = sbuf[0];
4923 sbuf[pos++] = cell_frac;
4924 if (d > 0)
4926 sbuf[pos++] = comm->cell_f_max0[d];
4927 sbuf[pos++] = comm->cell_f_min1[d];
4930 if (bSepPME)
4932 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4933 sbuf[pos++] = comm->cycl[ddCyclPME];
4936 else
4938 sbuf[pos++] = comm->load[d+1].sum;
4939 sbuf[pos++] = comm->load[d+1].max;
4940 if (dlbIsOn(dd->comm))
4942 sbuf[pos++] = comm->load[d+1].sum_m;
4943 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4944 sbuf[pos++] = comm->load[d+1].flags;
4945 if (d > 0)
4947 sbuf[pos++] = comm->cell_f_max0[d];
4948 sbuf[pos++] = comm->cell_f_min1[d];
4951 if (bSepPME)
4953 sbuf[pos++] = comm->load[d+1].mdf;
4954 sbuf[pos++] = comm->load[d+1].pme;
4957 load->nload = pos;
4958 /* Communicate a row in DD direction d.
4959 * The communicators are set up such that the root always has rank 0.
4961 #if GMX_MPI
4962 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
4963 load->load, load->nload*sizeof(float), MPI_BYTE,
4964 0, comm->mpi_comm_load[d]);
4965 #endif
4966 if (dd->ci[dim] == dd->master_ci[dim])
4968 /* We are the root, process this row */
4969 if (dlbIsOn(comm))
4971 root = comm->root[d];
4973 load->sum = 0;
4974 load->max = 0;
4975 load->sum_m = 0;
4976 load->cvol_min = 1;
4977 load->flags = 0;
4978 load->mdf = 0;
4979 load->pme = 0;
4980 pos = 0;
4981 for (i = 0; i < dd->nc[dim]; i++)
4983 load->sum += load->load[pos++];
4984 load->max = std::max(load->max, load->load[pos]);
4985 pos++;
4986 if (dlbIsOn(dd->comm))
4988 if (root->bLimited)
4990 /* This direction could not be load balanced properly,
4991 * therefore we need to use the maximum instead of the average load.
4993 load->sum_m = std::max(load->sum_m, load->load[pos]);
4995 else
4997 load->sum_m += load->load[pos];
4999 pos++;
5000 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5001 pos++;
5002 if (d < dd->ndim-1)
5004 load->flags = (int)(load->load[pos++] + 0.5);
5006 if (d > 0)
5008 root->cell_f_max0[i] = load->load[pos++];
5009 root->cell_f_min1[i] = load->load[pos++];
5012 if (bSepPME)
5014 load->mdf = std::max(load->mdf, load->load[pos]);
5015 pos++;
5016 load->pme = std::max(load->pme, load->load[pos]);
5017 pos++;
5020 if (dlbIsOn(comm) && root->bLimited)
5022 load->sum_m *= dd->nc[dim];
5023 load->flags |= (1<<d);
5029 if (DDMASTER(dd))
5031 comm->nload += dd_load_count(comm);
5032 comm->load_step += comm->cycl[ddCyclStep];
5033 comm->load_sum += comm->load[0].sum;
5034 comm->load_max += comm->load[0].max;
5035 if (dlbIsOn(comm))
5037 for (d = 0; d < dd->ndim; d++)
5039 if (comm->load[0].flags & (1<<d))
5041 comm->load_lim[d]++;
5045 if (bSepPME)
5047 comm->load_mdf += comm->load[0].mdf;
5048 comm->load_pme += comm->load[0].pme;
5052 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5054 if (debug)
5056 fprintf(debug, "get_load_distribution finished\n");
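/* Descriptive note on the gather above: the loads are collected bottom-up
 * over the decomposition dimensions.  In the last DD dimension every rank
 * contributes its own force load; in lower dimensions only the ranks at
 * coordinate 0 of the higher dimension(s) participate and forward the
 * already aggregated row results, so the DD master ends up with the
 * grid-wide numbers in comm->load[0].
 */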
5060 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5062 /* Return the relative performance loss on the total run time
5063 * due to the force calculation load imbalance.
5065 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5067 return
5068 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5069 (dd->comm->load_step*dd->nnodes);
5071 else
5073 return 0;
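/* Worked example with made-up numbers: on 4 ranks, if the accumulated
 * per-step maximum load is load_max = 120, the summed load over all ranks
 * is load_sum = 400 and the accumulated step time is load_step = 150, the
 * estimated loss is (120*4 - 400)/(150*4) = 80/600, i.e. about 13 %.
 */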
5077 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5079 char buf[STRLEN];
5080 int npp, npme, nnodes, d, limp;
5081 float imbal, pme_f_ratio, lossf = 0, lossp = 0;
5082 gmx_bool bLim;
5083 gmx_domdec_comm_t *comm;
5085 comm = dd->comm;
5086 if (DDMASTER(dd) && comm->nload > 0)
5088 npp = dd->nnodes;
5089 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5090 nnodes = npp + npme;
5091 if (dd->nnodes > 1 && comm->load_sum > 0)
5093 imbal = comm->load_max*npp/comm->load_sum - 1;
5094 lossf = dd_force_imb_perf_loss(dd);
5095 sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5096 fprintf(fplog, "%s", buf);
5097 fprintf(stderr, "\n");
5098 fprintf(stderr, "%s", buf);
5099 sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5100 fprintf(fplog, "%s", buf);
5101 fprintf(stderr, "%s", buf);
5103 bLim = FALSE;
5104 if (dlbIsOn(comm))
5106 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5107 for (d = 0; d < dd->ndim; d++)
5109 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5110 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5111 if (limp >= 50)
5113 bLim = TRUE;
5116 sprintf(buf+strlen(buf), "\n");
5117 fprintf(fplog, "%s", buf);
5118 fprintf(stderr, "%s", buf);
5120 if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5122 pme_f_ratio = comm->load_pme/comm->load_mdf;
5123 lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
5124 if (lossp <= 0)
5126 lossp *= (float)npme/(float)nnodes;
5128 else
5130 lossp *= (float)npp/(float)nnodes;
5132 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5133 fprintf(fplog, "%s", buf);
5134 fprintf(stderr, "%s", buf);
5135 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5136 fprintf(fplog, "%s", buf);
5137 fprintf(stderr, "%s", buf);
5139 fprintf(fplog, "\n");
5140 fprintf(stderr, "\n");
5142 if (lossf >= DD_PERF_LOSS_WARN)
5144 sprintf(buf,
5145 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5146 " in the domain decomposition.\n", lossf*100);
5147 if (!dlbIsOn(comm))
5149 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb).\n");
5151 else if (bLim)
5153 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5155 fprintf(fplog, "%s\n", buf);
5156 fprintf(stderr, "%s\n", buf);
5158 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5160 sprintf(buf,
5161 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5162 " had %s work to do than the PP ranks.\n"
5163 " You might want to %s the number of PME ranks\n"
5164 " or %s the cut-off and the grid spacing.\n",
5165 fabs(lossp*100),
5166 (lossp < 0) ? "less" : "more",
5167 (lossp < 0) ? "decrease" : "increase",
5168 (lossp < 0) ? "decrease" : "increase");
5169 fprintf(fplog, "%s\n", buf);
5170 fprintf(stderr, "%s\n", buf);
5175 static float dd_vol_min(gmx_domdec_t *dd)
5177 return dd->comm->load[0].cvol_min*dd->nnodes;
5180 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5182 return dd->comm->load[0].flags;
5185 static float dd_f_imbal(gmx_domdec_t *dd)
5187 if (dd->comm->load[0].sum > 0)
5189 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5191 else
5193 /* Something is wrong in the cycle counting, report no load imbalance */
5194 return 0.0f;
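/* Worked example with made-up numbers: with 4 ranks, load[0].max = 30 and
 * load[0].sum = 100 gives 30*4/100 - 1 = 0.2, reported as 20 % force-load
 * imbalance (0 % would mean every rank carries exactly the average load).
 */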
5198 float dd_pme_f_ratio(gmx_domdec_t *dd)
5200 /* Should only be called on the DD master rank */
5201 assert(DDMASTER(dd));
5203 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5205 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5207 else
5209 return -1.0;
5213 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5215 int flags, d;
5216 char buf[22];
5218 flags = dd_load_flags(dd);
5219 if (flags)
5221 fprintf(fplog,
5222 "DD load balancing is limited by minimum cell size in dimension");
5223 for (d = 0; d < dd->ndim; d++)
5225 if (flags & (1<<d))
5227 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5230 fprintf(fplog, "\n");
5232 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5233 if (dlbIsOn(dd->comm))
5235 fprintf(fplog, " vol min/aver %5.3f%c",
5236 dd_vol_min(dd), flags ? '!' : ' ');
5238 if (dd->nnodes > 1)
5240 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5242 if (dd->comm->cycl_n[ddCyclPME])
5244 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5246 fprintf(fplog, "\n\n");
5249 static void dd_print_load_verbose(gmx_domdec_t *dd)
5251 if (dlbIsOn(dd->comm))
5253 fprintf(stderr, "vol %4.2f%c ",
5254 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5256 if (dd->nnodes > 1)
5258 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5260 if (dd->comm->cycl_n[ddCyclPME])
5262 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5266 #if GMX_MPI
5267 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5269 MPI_Comm c_row;
5270 int dim, i, rank;
5271 ivec loc_c;
5272 domdec_root_t *root;
5273 gmx_bool bPartOfGroup = FALSE;
5275 dim = dd->dim[dim_ind];
5276 copy_ivec(loc, loc_c);
5277 for (i = 0; i < dd->nc[dim]; i++)
5279 loc_c[dim] = i;
5280 rank = dd_index(dd->nc, loc_c);
5281 if (rank == dd->rank)
5283 /* This process is part of the group */
5284 bPartOfGroup = TRUE;
5287 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5288 &c_row);
5289 if (bPartOfGroup)
5291 dd->comm->mpi_comm_load[dim_ind] = c_row;
5292 if (dd->comm->dlbState != edlbsOffForever)
5294 if (dd->ci[dim] == dd->master_ci[dim])
5296 /* This is the root process of this row */
5297 snew(dd->comm->root[dim_ind], 1);
5298 root = dd->comm->root[dim_ind];
5299 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5300 snew(root->old_cell_f, dd->nc[dim]+1);
5301 snew(root->bCellMin, dd->nc[dim]);
5302 if (dim_ind > 0)
5304 snew(root->cell_f_max0, dd->nc[dim]);
5305 snew(root->cell_f_min1, dd->nc[dim]);
5306 snew(root->bound_min, dd->nc[dim]);
5307 snew(root->bound_max, dd->nc[dim]);
5309 snew(root->buf_ncd, dd->nc[dim]);
5311 else
5313 /* This is not a root process, we only need to receive cell_f */
5314 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5317 if (dd->ci[dim] == dd->master_ci[dim])
5319 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5323 #endif
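/* Illustrative example (hypothetical 4x2x1 grid): for dim_ind = 0, all ranks
 * sharing the same y-coordinate form one 4-rank row communicator through the
 * MPI_Comm_split above; ranks outside the requested row pass MPI_UNDEFINED
 * and receive no communicator.  When DLB is not permanently off, the rank
 * whose coordinate matches master_ci along this dimension acts as the row
 * root and allocates the cell_f bookkeeping, while the other row members
 * only allocate cell_f_row to receive the result.
 */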
5325 void dd_setup_dlb_resource_sharing(t_commrec gmx_unused *cr,
5326 const gmx_hw_info_t gmx_unused *hwinfo,
5327 const gmx_hw_opt_t gmx_unused *hw_opt)
5329 #if GMX_MPI
5330 int physicalnode_id_hash;
5331 int gpu_id;
5332 gmx_domdec_t *dd;
5333 MPI_Comm mpi_comm_pp_physicalnode;
5335 if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5337 /* Only PP nodes (currently) use GPUs.
5338 * If we don't have GPUs, there are no resources to share.
5340 return;
5343 physicalnode_id_hash = gmx_physicalnode_id_hash();
5345 gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5347 dd = cr->dd;
5349 if (debug)
5351 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5352 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5353 dd->rank, physicalnode_id_hash, gpu_id);
5355 /* Split the PP communicator over the physical nodes */
5356 /* TODO: See if we should store this (before), as it's also used
5357 * for the nodecomm summation.
5359 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5360 &mpi_comm_pp_physicalnode);
5361 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5362 &dd->comm->mpi_comm_gpu_shared);
5363 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5364 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5366 if (debug)
5368 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5371 /* Note that some ranks could share a GPU, while others don't */
5373 if (dd->comm->nrank_gpu_shared == 1)
5375 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5377 #endif
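/* Illustrative example (hypothetical layout): with 2 physical nodes, 4 PP
 * ranks per node and 2 GPUs per node, the first split (by the physical-node
 * hash) yields one 4-rank communicator per node, and the second split (by
 * gpu_id) divides each of those into two 2-rank mpi_comm_gpu_shared
 * communicators.  If a rank turns out not to share its GPU with anyone
 * (nrank_gpu_shared == 1), the communicator is freed again right away.
 */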
5380 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5382 #if GMX_MPI
5383 int dim0, dim1, i, j;
5384 ivec loc;
5386 if (debug)
5388 fprintf(debug, "Making load communicators\n");
5391 snew(dd->comm->load, std::max(dd->ndim, 1));
5392 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5394 if (dd->ndim == 0)
5396 return;
5399 clear_ivec(loc);
5400 make_load_communicator(dd, 0, loc);
5401 if (dd->ndim > 1)
5403 dim0 = dd->dim[0];
5404 for (i = 0; i < dd->nc[dim0]; i++)
5406 loc[dim0] = i;
5407 make_load_communicator(dd, 1, loc);
5410 if (dd->ndim > 2)
5412 dim0 = dd->dim[0];
5413 for (i = 0; i < dd->nc[dim0]; i++)
5415 loc[dim0] = i;
5416 dim1 = dd->dim[1];
5417 for (j = 0; j < dd->nc[dim1]; j++)
5419 loc[dim1] = j;
5420 make_load_communicator(dd, 2, loc);
5425 if (debug)
5427 fprintf(debug, "Finished making load communicators\n");
5429 #endif
5432 /*! \brief Sets up the relation between neighboring domains and zones */
5433 static void setup_neighbor_relations(gmx_domdec_t *dd)
5435 int d, dim, i, j, m;
5436 ivec tmp, s;
5437 gmx_domdec_zones_t *zones;
5438 gmx_domdec_ns_ranges_t *izone;
5440 for (d = 0; d < dd->ndim; d++)
5442 dim = dd->dim[d];
5443 copy_ivec(dd->ci, tmp);
5444 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5445 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5446 copy_ivec(dd->ci, tmp);
5447 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5448 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5449 if (debug)
5451 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5452 dd->rank, dim,
5453 dd->neighbor[d][0],
5454 dd->neighbor[d][1]);
5458 int nzone = (1 << dd->ndim);
5459 int nizone = (1 << std::max(dd->ndim - 1, 0));
5460 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5462 zones = &dd->comm->zones;
5464 for (i = 0; i < nzone; i++)
5466 m = 0;
5467 clear_ivec(zones->shift[i]);
5468 for (d = 0; d < dd->ndim; d++)
5470 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5474 zones->n = nzone;
5475 for (i = 0; i < nzone; i++)
5477 for (d = 0; d < DIM; d++)
5479 s[d] = dd->ci[d] - zones->shift[i][d];
5480 if (s[d] < 0)
5482 s[d] += dd->nc[d];
5484 else if (s[d] >= dd->nc[d])
5486 s[d] -= dd->nc[d];
5490 zones->nizone = nizone;
5491 for (i = 0; i < zones->nizone; i++)
5493 assert(ddNonbondedZonePairRanges[i][0] == i);
5495 izone = &zones->izone[i];
5496 /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
5497 * j-zones up to nzone.
5499 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5500 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5501 for (dim = 0; dim < DIM; dim++)
5503 if (dd->nc[dim] == 1)
5505 /* All shifts should be allowed */
5506 izone->shift0[dim] = -1;
5507 izone->shift1[dim] = 1;
5509 else
5511 /* Determine the min/max j-zone shift wrt the i-zone */
5512 izone->shift0[dim] = 1;
5513 izone->shift1[dim] = -1;
5514 for (j = izone->j0; j < izone->j1; j++)
5516 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5517 if (shift_diff < izone->shift0[dim])
5519 izone->shift0[dim] = shift_diff;
5521 if (shift_diff > izone->shift1[dim])
5523 izone->shift1[dim] = shift_diff;
5530 if (dd->comm->dlbState != edlbsOffForever)
5532 snew(dd->comm->root, dd->ndim);
5535 if (dd->comm->bRecordLoad)
5537 make_load_communicators(dd);
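/* Illustrative example (2-D decomposition, ndim = 2): the code above yields
 * nzone = 4 communication zones, with shifts along the two decomposition
 * dimensions taken from the dd_zo zone-order table, and nizone = 2 i-zones,
 * each listing the range of j-zones it needs pair interactions with.  The
 * neighbor ranks per dimension are simply the ranks at ci[dim] +/- 1 with
 * periodic wrapping.
 */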
5541 static void make_pp_communicator(FILE *fplog,
5542 gmx_domdec_t *dd,
5543 t_commrec gmx_unused *cr,
5544 int gmx_unused reorder)
5546 #if GMX_MPI
5547 gmx_domdec_comm_t *comm;
5548 int rank, *buf;
5549 ivec periods;
5550 MPI_Comm comm_cart;
5552 comm = dd->comm;
5554 if (comm->bCartesianPP)
5556 /* Set up cartesian communication for the particle-particle part */
5557 if (fplog)
5559 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5560 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5563 for (int i = 0; i < DIM; i++)
5565 periods[i] = TRUE;
5567 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5568 &comm_cart);
5569 /* We overwrite the old communicator with the new cartesian one */
5570 cr->mpi_comm_mygroup = comm_cart;
5573 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5574 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5576 if (comm->bCartesianPP_PME)
5578 /* Since we want to use the original cartesian setup for the simulation,
5579 * and not the one after split, we need to make an index.
5581 snew(comm->ddindex2ddnodeid, dd->nnodes);
5582 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5583 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5584 /* Get the rank of the DD master,
5585 * above we made sure that the master node is a PP node.
5587 if (MASTER(cr))
5589 rank = dd->rank;
5591 else
5593 rank = 0;
5595 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5597 else if (comm->bCartesianPP)
5599 if (cr->npmenodes == 0)
5601 /* The PP communicator is also
5602 * the communicator for this simulation
5604 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5606 cr->nodeid = dd->rank;
5608 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5610 /* We need to make an index to go from the coordinates
5611 * to the nodeid of this simulation.
5613 snew(comm->ddindex2simnodeid, dd->nnodes);
5614 snew(buf, dd->nnodes);
5615 if (cr->duty & DUTY_PP)
5617 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5619 /* Communicate the ddindex to simulation nodeid index */
5620 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5621 cr->mpi_comm_mysim);
5622 sfree(buf);
5624 /* Determine the master coordinates and rank.
5625 * The DD master should be the same node as the master of this sim.
5627 for (int i = 0; i < dd->nnodes; i++)
5629 if (comm->ddindex2simnodeid[i] == 0)
5631 ddindex2xyz(dd->nc, i, dd->master_ci);
5632 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5635 if (debug)
5637 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5640 else
5642 /* No Cartesian communicators */
5643 /* We use the rank in dd->comm->all as DD index */
5644 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5645 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5646 dd->masterrank = 0;
5647 clear_ivec(dd->master_ci);
5649 #endif
5651 if (fplog)
5653 fprintf(fplog,
5654 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5655 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5657 if (debug)
5659 fprintf(debug,
5660 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5661 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5665 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5666 t_commrec *cr)
5668 #if GMX_MPI
5669 gmx_domdec_comm_t *comm = dd->comm;
5671 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5673 int *buf;
5674 snew(comm->ddindex2simnodeid, dd->nnodes);
5675 snew(buf, dd->nnodes);
5676 if (cr->duty & DUTY_PP)
5678 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5680 /* Communicate the ddindex to simulation nodeid index */
5681 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5682 cr->mpi_comm_mysim);
5683 sfree(buf);
5685 #else
5686 GMX_UNUSED_VALUE(dd);
5687 GMX_UNUSED_VALUE(cr);
5688 #endif
5691 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5692 int ncg, int natoms)
5694 gmx_domdec_master_t *ma;
5695 int i;
5697 snew(ma, 1);
5699 snew(ma->ncg, dd->nnodes);
5700 snew(ma->index, dd->nnodes+1);
5701 snew(ma->cg, ncg);
5702 snew(ma->nat, dd->nnodes);
5703 snew(ma->ibuf, dd->nnodes*2);
5704 snew(ma->cell_x, DIM);
5705 for (i = 0; i < DIM; i++)
5707 snew(ma->cell_x[i], dd->nc[i]+1);
5710 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5712 ma->vbuf = nullptr;
5714 else
5716 snew(ma->vbuf, natoms);
5719 return ma;
5722 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5723 int gmx_unused dd_rank_order,
5724 int gmx_unused reorder)
5726 gmx_domdec_comm_t *comm;
5727 int i;
5728 gmx_bool bDiv[DIM];
5729 #if GMX_MPI
5730 MPI_Comm comm_cart;
5731 #endif
5733 comm = dd->comm;
5735 if (comm->bCartesianPP)
5737 for (i = 1; i < DIM; i++)
5739 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5741 if (bDiv[YY] || bDiv[ZZ])
5743 comm->bCartesianPP_PME = TRUE;
5744 /* If we have 2D PME decomposition, which is always in x+y,
5745 * we stack the PME only nodes in z.
5746 * Otherwise we choose the direction that provides the thinnest slab
5747 * of PME only nodes as this will have the least effect
5748 * on the PP communication.
5749 * But for the PME communication the opposite might be better.
5751 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5752 !bDiv[YY] ||
5753 dd->nc[YY] > dd->nc[ZZ]))
5755 comm->cartpmedim = ZZ;
5757 else
5759 comm->cartpmedim = YY;
5761 comm->ntot[comm->cartpmedim]
5762 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5764 else if (fplog)
5766 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5767 fprintf(fplog,
5768 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5772 #if GMX_MPI
5773 if (comm->bCartesianPP_PME)
5775 int rank;
5776 ivec periods;
5778 if (fplog)
5780 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5783 for (i = 0; i < DIM; i++)
5785 periods[i] = TRUE;
5787 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5788 &comm_cart);
5789 MPI_Comm_rank(comm_cart, &rank);
5790 if (MASTER(cr) && rank != 0)
5792 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5795 /* With this assignment we lose the link to the original communicator,
5796 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5798 cr->mpi_comm_mysim = comm_cart;
5799 cr->sim_nodeid = rank;
5801 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5803 if (fplog)
5805 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5806 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5809 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5811 cr->duty = DUTY_PP;
5813 if (cr->npmenodes == 0 ||
5814 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5816 cr->duty = DUTY_PME;
5819 /* Split the sim communicator into PP and PME only nodes */
5820 MPI_Comm_split(cr->mpi_comm_mysim,
5821 cr->duty,
5822 dd_index(comm->ntot, dd->ci),
5823 &cr->mpi_comm_mygroup);
5825 else
5827 switch (dd_rank_order)
5829 case ddrankorderPP_PME:
5830 if (fplog)
5832 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5834 break;
5835 case ddrankorderINTERLEAVE:
5836 /* Interleave the PP-only and PME-only ranks */
5837 if (fplog)
5839 fprintf(fplog, "Interleaving PP and PME ranks\n");
5841 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5842 break;
5843 case ddrankorderCARTESIAN:
5844 break;
5845 default:
5846 gmx_fatal(FARGS, "Unknown dd_rank_order=%d", dd_rank_order);
5849 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5851 cr->duty = DUTY_PME;
5853 else
5855 cr->duty = DUTY_PP;
5858 /* Split the sim communicator into PP and PME only nodes */
5859 MPI_Comm_split(cr->mpi_comm_mysim,
5860 cr->duty,
5861 cr->nodeid,
5862 &cr->mpi_comm_mygroup);
5863 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5865 #endif
5867 if (fplog)
5869 fprintf(fplog, "This rank does only %s work.\n\n",
5870 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5874 /*! \brief Generates the MPI communicators for domain decomposition */
5875 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5876 gmx_domdec_t *dd, int dd_rank_order)
5878 gmx_domdec_comm_t *comm;
5879 int CartReorder;
5881 comm = dd->comm;
5883 copy_ivec(dd->nc, comm->ntot);
5885 comm->bCartesianPP = (dd_rank_order == ddrankorderCARTESIAN);
5886 comm->bCartesianPP_PME = FALSE;
5888 /* Reorder the nodes by default. This might change the MPI ranks.
5889 * Real reordering is only supported on very few architectures;
5890 * Blue Gene is one of them.
5892 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
5894 if (cr->npmenodes > 0)
5896 /* Split the communicator into a PP and PME part */
5897 split_communicator(fplog, cr, dd, dd_rank_order, CartReorder);
5898 if (comm->bCartesianPP_PME)
5900 /* We (possibly) reordered the nodes in split_communicator,
5901 * so it is no longer required in make_pp_communicator.
5903 CartReorder = FALSE;
5906 else
5908 /* All nodes do PP and PME */
5909 #if GMX_MPI
5910 /* We do not require separate communicators */
5911 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5912 #endif
5915 if (cr->duty & DUTY_PP)
5917 /* Copy or make a new PP communicator */
5918 make_pp_communicator(fplog, dd, cr, CartReorder);
5920 else
5922 receive_ddindex2simnodeid(dd, cr);
5925 if (!(cr->duty & DUTY_PME))
5927 /* Set up the communication to our PME node */
5928 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
5929 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
5930 if (debug)
5932 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
5933 dd->pme_nodeid, dd->pme_receive_vir_ener);
5936 else
5938 dd->pme_nodeid = -1;
5941 if (DDMASTER(dd))
5943 dd->ma = init_gmx_domdec_master_t(dd,
5944 comm->cgs_gl.nr,
5945 comm->cgs_gl.index[comm->cgs_gl.nr]);
5949 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
5951 real *slb_frac, tot;
5952 int i, n;
5953 double dbl;
5955 slb_frac = nullptr;
5956 if (nc > 1 && size_string != nullptr)
5958 if (fplog)
5960 fprintf(fplog, "Using static load balancing for the %s direction\n",
5961 dir);
5963 snew(slb_frac, nc);
5964 tot = 0;
5965 for (i = 0; i < nc; i++)
5967 dbl = 0;
5968 sscanf(size_string, "%20lf%n", &dbl, &n);
5969 if (dbl == 0)
5971 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
5973 slb_frac[i] = dbl;
5974 size_string += n;
5975 tot += slb_frac[i];
5977 /* Normalize */
5978 if (fplog)
5980 fprintf(fplog, "Relative cell sizes:");
5982 for (i = 0; i < nc; i++)
5984 slb_frac[i] /= tot;
5985 if (fplog)
5987 fprintf(fplog, " %5.3f", slb_frac[i]);
5990 if (fplog)
5992 fprintf(fplog, "\n");
5996 return slb_frac;
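/* Illustrative example (hypothetical input): for nc = 3 and a size string
 * of "1 2 1" (as given with the static load balancing cell-size options),
 * the entries sum to 4 and the returned fractions are 0.25, 0.50 and 0.25,
 * so the middle cell gets twice the width of the outer two.
 */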
5999 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6001 int n, nmol, ftype;
6002 gmx_mtop_ilistloop_t iloop;
6003 t_ilist *il;
6005 n = 0;
6006 iloop = gmx_mtop_ilistloop_init(mtop);
6007 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6009 for (ftype = 0; ftype < F_NRE; ftype++)
6011 if ((interaction_function[ftype].flags & IF_BOND) &&
6012 NRAL(ftype) > 2)
6014 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6019 return n;
6022 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6024 char *val;
6025 int nst;
6027 nst = def;
6028 val = getenv(env_var);
6029 if (val)
6031 if (sscanf(val, "%20d", &nst) <= 0)
6033 nst = 1;
6035 if (fplog)
6037 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6038 env_var, val, nst);
6042 return nst;
6045 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6047 if (MASTER(cr))
6049 fprintf(stderr, "\n%s\n", warn_string);
6051 if (fplog)
6053 fprintf(fplog, "\n%s\n", warn_string);
6057 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6058 const t_inputrec *ir, FILE *fplog)
6060 if (ir->ePBC == epbcSCREW &&
6061 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6063 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6066 if (ir->ns_type == ensSIMPLE)
6068 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6071 if (ir->nstlist == 0)
6073 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6076 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6078 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6082 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6084 int di, d;
6085 real r;
6087 r = ddbox->box_size[XX];
6088 for (di = 0; di < dd->ndim; di++)
6090 d = dd->dim[di];
6091 /* Check using the initial average cell size */
6092 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6095 return r;
6098 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6099 const char *dlb_opt, gmx_bool bRecordLoad,
6100 unsigned long Flags, const t_inputrec *ir)
6102 int dlbState = -1;
6103 char buf[STRLEN];
6105 switch (dlb_opt[0])
6107 case 'a': dlbState = edlbsOffCanTurnOn; break;
6108 case 'n': dlbState = edlbsOffForever; break;
6109 case 'y': dlbState = edlbsOnForever; break;
6110 default: gmx_incons("Unknown dlb_opt");
6113 if (Flags & MD_RERUN)
6115 return edlbsOffForever;
6118 if (!EI_DYNAMICS(ir->eI))
6120 if (dlbState == edlbsOnForever)
6122 sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6123 dd_warning(cr, fplog, buf);
6126 return edlbsOffForever;
6129 if (!bRecordLoad)
6131 dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6132 return edlbsOffForever;
6135 if (Flags & MD_REPRODUCIBLE)
6137 switch (dlbState)
6139 case edlbsOffForever:
6140 break;
6141 case edlbsOffCanTurnOn:
6142 case edlbsOnCanTurnOff:
6143 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6144 dlbState = edlbsOffForever;
6145 break;
6146 case edlbsOnForever:
6147 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6148 break;
6149 default:
6150 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6151 break;
6155 return dlbState;
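/* Summary of the mapping above (descriptive only): dlb_opt "auto" starts in
 * the off-can-turn-on state, "no" means off forever and "yes" means on
 * forever.  Reruns, non-dynamical integrators and missing cycle counters
 * force DLB off forever, and a reproducibility request downgrades the
 * automatic states to off forever (with "yes" only a warning is printed).
 */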
6158 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6160 int dim;
6162 dd->ndim = 0;
6163 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6165 /* Decomposition order z,y,x */
6166 if (fplog)
6168 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6170 for (dim = DIM-1; dim >= 0; dim--)
6172 if (dd->nc[dim] > 1)
6174 dd->dim[dd->ndim++] = dim;
6178 else
6180 /* Decomposition order x,y,z */
6181 for (dim = 0; dim < DIM; dim++)
6183 if (dd->nc[dim] > 1)
6185 dd->dim[dd->ndim++] = dim;
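/* Illustrative example (hypothetical grid): with dd->nc = {4, 2, 1} the
 * default x,y,z order gives ndim = 2 with dim[0] = XX and dim[1] = YY,
 * while setting GMX_DD_ORDER_ZYX reverses this to dim[0] = YY, dim[1] = XX;
 * dimensions with only one cell never appear in dd->dim.
 */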
6191 static gmx_domdec_comm_t *init_dd_comm()
6193 gmx_domdec_comm_t *comm;
6194 int i;
6196 snew(comm, 1);
6197 snew(comm->cggl_flag, DIM*2);
6198 snew(comm->cgcm_state, DIM*2);
6199 for (i = 0; i < DIM*2; i++)
6201 comm->cggl_flag_nalloc[i] = 0;
6202 comm->cgcm_state_nalloc[i] = 0;
6205 comm->nalloc_int = 0;
6206 comm->buf_int = nullptr;
6208 vec_rvec_init(&comm->vbuf);
6210 comm->n_load_have = 0;
6211 comm->n_load_collect = 0;
6213 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6215 comm->sum_nat[i] = 0;
6217 comm->ndecomp = 0;
6218 comm->nload = 0;
6219 comm->load_step = 0;
6220 comm->load_sum = 0;
6221 comm->load_max = 0;
6222 clear_ivec(comm->load_lim);
6223 comm->load_mdf = 0;
6224 comm->load_pme = 0;
6226 return comm;
6229 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6230 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6231 unsigned long Flags,
6232 ivec nc, int nPmeRanks,
6233 real comm_distance_min, real rconstr,
6234 const char *dlb_opt, real dlb_scale,
6235 const char *sizex, const char *sizey, const char *sizez,
6236 const gmx_mtop_t *mtop,
6237 const t_inputrec *ir,
6238 matrix box, const rvec *x,
6239 gmx_ddbox_t *ddbox,
6240 int *npme_x, int *npme_y)
6242 real r_bonded = -1;
6243 real r_bonded_limit = -1;
6244 const real tenPercentMargin = 1.1;
6245 gmx_domdec_comm_t *comm = dd->comm;
6247 snew(comm->cggl_flag, DIM*2);
6248 snew(comm->cgcm_state, DIM*2);
6250 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6251 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6253 dd->pme_recv_f_alloc = 0;
6254 dd->pme_recv_f_buf = nullptr;
6256 /* Initialize the GPU share count to 0, might change later */
6257 comm->nrank_gpu_shared = 0;
6259 comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6260 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6261 /* To consider turning DLB on after 2*nstlist steps we need to check
6262 * at partitioning count 3. Thus we need to increase the first count by 2.
6264 comm->ddPartioningCountFirstDlbOff += 2;
6266 if (fplog)
6268 fprintf(fplog, "Dynamic load balancing: %s\n",
6269 edlbs_names[comm->dlbState]);
6271 comm->bPMELoadBalDLBLimits = FALSE;
6273 /* Allocate the charge group/atom sorting struct */
6274 snew(comm->sort, 1);
6276 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6278 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6279 mtop->bIntermolecularInteractions);
6280 if (comm->bInterCGBondeds)
6282 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6284 else
6286 comm->bInterCGMultiBody = FALSE;
6289 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6290 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6292 if (ir->rlist == 0)
6294 /* Set the cut-off to some very large value,
6295 * so we don't need if statements everywhere in the code.
6296 * We use sqrt, since the cut-off is squared in some places.
6298 comm->cutoff = GMX_CUTOFF_INF;
6300 else
6302 comm->cutoff = ir->rlist;
6304 comm->cutoff_mbody = 0;
6306 comm->cellsize_limit = 0;
6307 comm->bBondComm = FALSE;
6309 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6310 * within nstlist steps. Since boundaries are allowed to displace by half
6311 * a cell size, DD cells should be at least the size of the list buffer.
6313 comm->cellsize_limit = std::max(comm->cellsize_limit,
6314 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6316 if (comm->bInterCGBondeds)
6318 if (comm_distance_min > 0)
6320 comm->cutoff_mbody = comm_distance_min;
6321 if (Flags & MD_DDBONDCOMM)
6323 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6325 else
6327 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6329 r_bonded_limit = comm->cutoff_mbody;
6331 else if (ir->bPeriodicMols)
6333 /* Can not easily determine the required cut-off */
6334 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6335 comm->cutoff_mbody = comm->cutoff/2;
6336 r_bonded_limit = comm->cutoff_mbody;
6338 else
6340 real r_2b, r_mb;
6342 if (MASTER(cr))
6344 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6345 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6347 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6348 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6350 /* We use an initial margin of 10% for the minimum cell size,
6351 * except when we are just below the non-bonded cut-off.
6353 if (Flags & MD_DDBONDCOMM)
6355 if (std::max(r_2b, r_mb) > comm->cutoff)
6357 r_bonded = std::max(r_2b, r_mb);
6358 r_bonded_limit = tenPercentMargin*r_bonded;
6359 comm->bBondComm = TRUE;
6361 else
6363 r_bonded = r_mb;
6364 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6366 /* We determine cutoff_mbody later */
6368 else
6370 /* No special bonded communication,
6371 * simply increase the DD cut-off.
6373 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6374 comm->cutoff_mbody = r_bonded_limit;
6375 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6378 if (fplog)
6380 fprintf(fplog,
6381 "Minimum cell size due to bonded interactions: %.3f nm\n",
6382 r_bonded_limit);
6384 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6387 if (dd->bInterCGcons && rconstr <= 0)
6389 /* There is a cell size limit due to the constraints (P-LINCS) */
6390 rconstr = constr_r_max(fplog, mtop, ir);
6391 if (fplog)
6393 fprintf(fplog,
6394 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6395 rconstr);
6396 if (rconstr > comm->cellsize_limit)
6398 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6402 else if (rconstr > 0 && fplog)
6404 /* Here we do not check for dd->bInterCGcons,
6405 * because one can also set a cell size limit for virtual sites only
6406 * and at this point we don't know yet if there are intercg v-sites.
6408 fprintf(fplog,
6409 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6410 rconstr);
6412 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6414 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6416 if (nc[XX] > 0)
6418 copy_ivec(nc, dd->nc);
6419 set_dd_dim(fplog, dd);
6420 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6422 if (nPmeRanks >= 0)
6424 cr->npmenodes = nPmeRanks;
6426 else
6428 /* When the DD grid is set explicitly and -npme is set to auto,
6429 * don't use PME ranks. We check later if the DD grid is
6430 * compatible with the total number of ranks.
6432 cr->npmenodes = 0;
6435 real acs = average_cellsize_min(dd, ddbox);
6436 if (acs < comm->cellsize_limit)
6438 if (fplog)
6440 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6442 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6443 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6444 acs, comm->cellsize_limit);
6447 else
6449 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, x, ddbox);
6451 /* We need to choose the optimal DD grid and possibly PME nodes */
6452 real limit =
6453 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6454 nPmeRanks,
6455 comm->dlbState != edlbsOffForever, dlb_scale,
6456 comm->cellsize_limit, comm->cutoff,
6457 comm->bInterCGBondeds);
6459 if (dd->nc[XX] == 0)
6461 char buf[STRLEN];
6462 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6463 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6464 !bC ? "-rdd" : "-rcon",
6465 comm->dlbState != edlbsOffForever ? " or -dds" : "",
6466 bC ? " or your LINCS settings" : "");
6468 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6469 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6470 "%s\n"
6471 "Look in the log file for details on the domain decomposition",
6472 cr->nnodes-cr->npmenodes, limit, buf);
6474 set_dd_dim(fplog, dd);
6477 if (fplog)
6479 fprintf(fplog,
6480 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6481 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6484 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6485 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6487 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6488 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6489 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6491 if (cr->npmenodes > dd->nnodes)
6493 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6494 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6496 if (cr->npmenodes > 0)
6498 comm->npmenodes = cr->npmenodes;
6500 else
6502 comm->npmenodes = dd->nnodes;
6505 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6507 /* The following choices should match those
6508 * in comm_cost_est in domdec_setup.c.
6509 * Note that here the checks have to take into account
6510 * that the decomposition might occur in a different order than xyz
6511 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6512 * in which case they will not match those in comm_cost_est,
6513 * but since that is mainly for testing purposes that's fine.
6515 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6516 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6517 getenv("GMX_PMEONEDD") == nullptr)
6519 comm->npmedecompdim = 2;
6520 comm->npmenodes_x = dd->nc[XX];
6521 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6523 else
6525 /* In case nc is 1 in both x and y we could still choose to
6526 * decompose pme in y instead of x, but we use x for simplicity.
6528 comm->npmedecompdim = 1;
6529 if (dd->dim[0] == YY)
6531 comm->npmenodes_x = 1;
6532 comm->npmenodes_y = comm->npmenodes;
6534 else
6536 comm->npmenodes_x = comm->npmenodes;
6537 comm->npmenodes_y = 1;
6540 if (fplog)
6542 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6543 comm->npmenodes_x, comm->npmenodes_y, 1);
6546 else
6548 comm->npmedecompdim = 0;
6549 comm->npmenodes_x = 0;
6550 comm->npmenodes_y = 0;
6553 /* Technically we don't need both of these,
6554 * but it simplifies code not having to recalculate it.
6556 *npme_x = comm->npmenodes_x;
6557 *npme_y = comm->npmenodes_y;
6559 snew(comm->slb_frac, DIM);
6560 if (comm->dlbState == edlbsOffForever)
6562 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6563 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6564 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6567 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6569 if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6571 /* Set the bonded communication distance to halfway
6572 * the minimum and the maximum,
6573 * since the extra communication cost is nearly zero.
6575 real acs = average_cellsize_min(dd, ddbox);
6576 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6577 if (comm->dlbState != edlbsOffForever)
6579 /* Check if this does not limit the scaling */
6580 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6582 if (!comm->bBondComm)
6584 /* Without bBondComm do not go beyond the n.b. cut-off */
6585 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6586 if (comm->cellsize_limit >= comm->cutoff)
6588 /* We don't lose a lot of efficiency
6589 * when increasing it to the n.b. cut-off.
6590 * It can even be slightly faster, because we need
6591 * fewer checks for the communication setup.
6593 comm->cutoff_mbody = comm->cutoff;
6596 /* Check if we did not end up below our original limit */
6597 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6599 if (comm->cutoff_mbody > comm->cellsize_limit)
6601 comm->cellsize_limit = comm->cutoff_mbody;
6604 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6607 if (debug)
6609 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6610 "cellsize limit %f\n",
6611 comm->bBondComm, comm->cellsize_limit);
6614 if (MASTER(cr))
6616 check_dd_restrictions(cr, dd, ir, fplog);
6620 static void set_dlb_limits(gmx_domdec_t *dd)
6623 int d;
6625 for (d = 0; d < dd->ndim; d++)
6627 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6628 dd->comm->cellsize_min[dd->dim[d]] =
6629 dd->comm->cellsize_min_dlb[dd->dim[d]];
6634 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6636 gmx_domdec_t *dd;
6637 gmx_domdec_comm_t *comm;
6638 real cellsize_min;
6639 int d, nc, i;
6641 dd = cr->dd;
6642 comm = dd->comm;
6644 cellsize_min = comm->cellsize_min[dd->dim[0]];
6645 for (d = 1; d < dd->ndim; d++)
6647 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6650 if (cellsize_min < comm->cellsize_limit*1.05)
6652 char buf[STRLEN];
6653 sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6655 /* Change DLB from "auto" to "no". */
6656 comm->dlbState = edlbsOffForever;
6658 return;
6661 char buf[STRLEN];
6662 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6663 dd_warning(cr, fplog, buf);
6664 comm->dlbState = edlbsOnCanTurnOff;
6666 /* Store the non-DLB performance, so we can check if DLB actually
6667 * improves performance.
6669 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6670 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6672 set_dlb_limits(dd);
6674 /* We can set the required cell size info here,
6675 * so we do not need to communicate this.
6676 * The grid is completely uniform.
6678 for (d = 0; d < dd->ndim; d++)
6680 if (comm->root[d])
6682 comm->load[d].sum_m = comm->load[d].sum;
6684 nc = dd->nc[dd->dim[d]];
6685 for (i = 0; i < nc; i++)
6687 comm->root[d]->cell_f[i] = i/(real)nc;
6688 if (d > 0)
6690 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6691 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6694 comm->root[d]->cell_f[nc] = 1.0;
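/* Illustrative example (hypothetical row of nc = 4 cells): the uniform
 * boundaries set above are cell_f = {0.0, 0.25, 0.5, 0.75, 1.0}, i.e. all
 * cells start DLB with equal relative size, matching the static grid.
 */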
6699 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6701 gmx_domdec_t *dd = cr->dd;
6703 char buf[STRLEN];
6704 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6705 dd_warning(cr, fplog, buf);
6706 dd->comm->dlbState = edlbsOffCanTurnOn;
6707 dd->comm->haveTurnedOffDlb = true;
6708 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6711 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6713 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6714 char buf[STRLEN];
6715 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6716 dd_warning(cr, fplog, buf);
6717 cr->dd->comm->dlbState = edlbsOffForever;
6720 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6722 int ncg, cg;
6723 char *bLocalCG;
6725 ncg = ncg_mtop(mtop);
6726 snew(bLocalCG, ncg);
6727 for (cg = 0; cg < ncg; cg++)
6729 bLocalCG[cg] = FALSE;
6732 return bLocalCG;
6735 void dd_init_bondeds(FILE *fplog,
6736 gmx_domdec_t *dd,
6737 const gmx_mtop_t *mtop,
6738 const gmx_vsite_t *vsite,
6739 const t_inputrec *ir,
6740 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6742 gmx_domdec_comm_t *comm;
6744 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6746 comm = dd->comm;
6748 if (comm->bBondComm)
6750 /* Communicate atoms beyond the cut-off for bonded interactions */
6751 comm = dd->comm;
6753 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6755 comm->bLocalCG = init_bLocalCG(mtop);
6757 else
6759 /* Only communicate atoms based on cut-off */
6760 comm->cglink = nullptr;
6761 comm->bLocalCG = nullptr;
6765 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6766 const gmx_mtop_t *mtop, const t_inputrec *ir,
6767 gmx_bool bDynLoadBal, real dlb_scale,
6768 const gmx_ddbox_t *ddbox)
6770 gmx_domdec_comm_t *comm;
6771 int d;
6772 ivec np;
6773 real limit, shrink;
6774 char buf[64];
6776 if (fplog == nullptr)
6778 return;
6781 comm = dd->comm;
6783 if (bDynLoadBal)
6785 fprintf(fplog, "The maximum number of communication pulses is:");
6786 for (d = 0; d < dd->ndim; d++)
6788 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6790 fprintf(fplog, "\n");
6791 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6792 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6793 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6794 for (d = 0; d < DIM; d++)
6796 if (dd->nc[d] > 1)
6798 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6800 shrink = 0;
6802 else
6804 shrink =
6805 comm->cellsize_min_dlb[d]/
6806 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6808 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6811 fprintf(fplog, "\n");
6813 else
6815 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6816 fprintf(fplog, "The initial number of communication pulses is:");
6817 for (d = 0; d < dd->ndim; d++)
6819 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6821 fprintf(fplog, "\n");
6822 fprintf(fplog, "The initial domain decomposition cell size is:");
6823 for (d = 0; d < DIM; d++)
6825 if (dd->nc[d] > 1)
6827 fprintf(fplog, " %c %.2f nm",
6828 dim2char(d), dd->comm->cellsize_min[d]);
6831 fprintf(fplog, "\n\n");
6834 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6836 if (comm->bInterCGBondeds ||
6837 bInterCGVsites ||
6838 dd->bInterCGcons || dd->bInterCGsettles)
6840 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6841 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6842 "non-bonded interactions", "", comm->cutoff);
6844 if (bDynLoadBal)
6846 limit = dd->comm->cellsize_limit;
6848 else
6850 if (dynamic_dd_box(ddbox, ir))
6852 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6854 limit = dd->comm->cellsize_min[XX];
6855 for (d = 1; d < DIM; d++)
6857 limit = std::min(limit, dd->comm->cellsize_min[d]);
6861 if (comm->bInterCGBondeds)
6863 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6864 "two-body bonded interactions", "(-rdd)",
6865 std::max(comm->cutoff, comm->cutoff_mbody));
6866 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6867 "multi-body bonded interactions", "(-rdd)",
6868 (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6870 if (bInterCGVsites)
6872 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6873 "virtual site constructions", "(-rcon)", limit);
6875 if (dd->bInterCGcons || dd->bInterCGsettles)
6877 sprintf(buf, "atoms separated by up to %d constraints",
6878 1+ir->nProjOrder);
6879 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6880 buf, "(-rcon)", limit);
6882 fprintf(fplog, "\n");
6885 fflush(fplog);
6888 static void set_cell_limits_dlb(gmx_domdec_t *dd,
6889 real dlb_scale,
6890 const t_inputrec *ir,
6891 const gmx_ddbox_t *ddbox)
6893 gmx_domdec_comm_t *comm;
6894 int d, dim, npulse, npulse_d_max, npulse_d;
6895 gmx_bool bNoCutOff;
6897 comm = dd->comm;
6899 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6901 /* Determine the maximum number of comm. pulses in one dimension */
6903 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
6905 /* Determine the maximum required number of grid pulses */
6906 if (comm->cellsize_limit >= comm->cutoff)
6908 /* Only a single pulse is required */
6909 npulse = 1;
6911 else if (!bNoCutOff && comm->cellsize_limit > 0)
6913 /* We round down slightly here to avoid overhead due to the latency
6914 * of extra communication calls when the cut-off
6915 * would be only slightly longer than the cell size.
6916 * Later cellsize_limit is redetermined,
6917 * so we cannot miss interactions due to this rounding.
6919 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6921 else
6923 /* There is no cell size limit */
6924 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
6927 if (!bNoCutOff && npulse > 1)
6929 /* See if we can do with less pulses, based on dlb_scale */
6930 npulse_d_max = 0;
6931 for (d = 0; d < dd->ndim; d++)
6933 dim = dd->dim[d];
6934 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6935 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6936 npulse_d_max = std::max(npulse_d_max, npulse_d);
6938 npulse = std::min(npulse, npulse_d_max);
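/* Worked example with made-up numbers (assuming a finite cut-off): for
 * comm->cutoff = 1.2 nm and comm->cellsize_limit = 0.5 nm the branch above
 * gives npulse = (int)(0.96 + 1.2/0.5) = 3, i.e. three communication pulses
 * cover the cut-off with a small rounding tolerance; the dlb_scale based
 * limit npulse_d_max can then reduce this further.
 */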
6941 /* This env var can override npulse */
6942 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
6943 if (d > 0)
6945 npulse = d;
6948 comm->maxpulse = 1;
6949 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6950 for (d = 0; d < dd->ndim; d++)
6952 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
6953 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6954 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
6955 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
6956 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6958 comm->bVacDLBNoLimit = FALSE;
6962 /* cellsize_limit is set for LINCS in init_domain_decomposition */
6963 if (!comm->bVacDLBNoLimit)
6965 comm->cellsize_limit = std::max(comm->cellsize_limit,
6966 comm->cutoff/comm->maxpulse);
6968 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
6969 /* Set the minimum cell size for each DD dimension */
6970 for (d = 0; d < dd->ndim; d++)
6972 if (comm->bVacDLBNoLimit ||
6973 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6975 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6977 else
6979 comm->cellsize_min_dlb[dd->dim[d]] =
6980 comm->cutoff/comm->cd[d].np_dlb;
6983 if (comm->cutoff_mbody <= 0)
6985 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
6987 if (dlbIsOn(comm))
6989 set_dlb_limits(dd);
6993 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
6995 /* If each molecule is a single charge group
6996 * or we use domain decomposition for each periodic dimension,
6997 * we do not need to take pbc into account for the bonded interactions.
6999 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7000 !(dd->nc[XX] > 1 &&
7001 dd->nc[YY] > 1 &&
7002 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7005 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7006 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7007 const gmx_mtop_t *mtop, const t_inputrec *ir,
7008 const gmx_ddbox_t *ddbox)
7010 gmx_domdec_comm_t *comm;
7011 int natoms_tot;
7012 real vol_frac;
7014 comm = dd->comm;
7016 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7018 init_ddpme(dd, &comm->ddpme[0], 0);
7019 if (comm->npmedecompdim >= 2)
7021 init_ddpme(dd, &comm->ddpme[1], 1);
7024 else
7026 comm->npmenodes = 0;
7027 if (dd->pme_nodeid >= 0)
7029 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7030 "Can not have separate PME ranks without PME electrostatics");
7034 if (debug)
7036 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7038 if (comm->dlbState != edlbsOffForever)
7040 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7043 print_dd_settings(fplog, dd, mtop, ir, dlbIsOn(comm), dlb_scale, ddbox);
7044 if (comm->dlbState == edlbsOffCanTurnOn)
7046 if (fplog)
7048 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7050 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7053 if (ir->ePBC == epbcNONE)
7055 vol_frac = 1 - 1/(double)dd->nnodes;
7057 else
7059 vol_frac =
7060 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
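/* vol_frac estimates the fraction of the system a rank sees (home zone
 * plus communicated zones). Assumed example: if comm_box_frac() returns
 * 1.5 on 8 ranks, vol_frac = 2.5/8 ~= 0.31, and the ga2la hash below is
 * sized for roughly 31% of the global atom count.
 */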
7062 if (debug)
7064 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7066 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7068 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7071 /*! \brief Set some important DD parameters that can be modified by env.vars */
7072 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7074 gmx_domdec_comm_t *comm = dd->comm;
7076 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7077 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7078 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7079 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7080 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7081 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7082 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7084 if (dd->bSendRecv2 && fplog)
7086 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7089 if (comm->eFlop)
7091 if (fplog)
7093 fprintf(fplog, "Will load balance based on FLOP count\n");
7095 if (comm->eFlop > 1)
7097 srand(1 + rank_mysim);
7099 comm->bRecordLoad = TRUE;
7101 else
7103 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7107 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7108 unsigned long Flags,
7109 ivec nc, int nPmeRanks,
7110 int dd_rank_order,
7111 real comm_distance_min, real rconstr,
7112 const char *dlb_opt, real dlb_scale,
7113 const char *sizex, const char *sizey, const char *sizez,
7114 const gmx_mtop_t *mtop,
7115 const t_inputrec *ir,
7116 matrix box, rvec *x,
7117 gmx_ddbox_t *ddbox,
7118 int *npme_x, int *npme_y)
7120 gmx_domdec_t *dd;
7122 if (fplog)
7124 fprintf(fplog,
7125 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7128 snew(dd, 1);
7130 dd->comm = init_dd_comm();
7132 set_dd_envvar_options(fplog, dd, cr->nodeid);
7134 set_dd_limits_and_grid(fplog, cr, dd, Flags,
7135 nc, nPmeRanks,
7136 comm_distance_min, rconstr,
7137 dlb_opt, dlb_scale,
7138 sizex, sizey, sizez,
7139 mtop, ir,
7140 box, x,
7141 ddbox,
7142 npme_x, npme_y);
7144 make_dd_communicators(fplog, cr, dd, dd_rank_order);
7146 if (cr->duty & DUTY_PP)
7148 set_ddgrid_parameters(fplog, dd, dlb_scale, mtop, ir, ddbox);
7150 setup_neighbor_relations(dd);
7153 /* Set overallocation to avoid frequent reallocation of arrays */
7154 set_over_alloc_dd(TRUE);
7156 /* Initialize DD partitioning counters */
7157 dd->comm->partition_step = INT_MIN;
7158 dd->ddp_count = 0;
7160 /* We don't know the number of threads yet; this is set later */
7161 dd->comm->nth = 0;
7163 clear_dd_cycle_counts(dd);
7165 return dd;
7168 static gmx_bool test_dd_cutoff(t_commrec *cr,
7169 t_state *state, const t_inputrec *ir,
7170 real cutoff_req)
7172 gmx_domdec_t *dd;
7173 gmx_ddbox_t ddbox;
7174 int d, dim, np;
7175 real inv_cell_size;
7176 int LocallyLimited;
7178 dd = cr->dd;
7180 set_ddbox(dd, FALSE, cr, ir, state->box,
7181 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7183 LocallyLimited = 0;
7185 for (d = 0; d < dd->ndim; d++)
7187 dim = dd->dim[d];
7189 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7190 if (dynamic_dd_box(&ddbox, ir))
7192 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7195 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
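/* Sketch with assumed numbers, ignoring the margin and skew factors:
 * for a cell size of 1.5 nm (nc[dim]/box_size ~= 1/1.5) and
 * cutoff_req = 2.0 nm, np = 1 + (int)(2.0/1.5) = 2 communication pulses
 * would be required along this dimension.
 */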
7197 if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7198 dd->comm->cd[d].np_dlb > 0)
7200 if (np > dd->comm->cd[d].np_dlb)
7202 return FALSE;
7205 /* If a current local cell size is smaller than the requested
7206 * cut-off, we could still fix it, but this gets very complicated.
7207 * Without fixing here, we might actually need more checks.
7209 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7211 LocallyLimited = 1;
7216 if (dd->comm->dlbState != edlbsOffForever)
7218 /* If DLB is not active yet, we don't need to check the grid jumps.
7219 * Actually we shouldn't, because then the grid jump data is not set.
7221 if (dlbIsOn(dd->comm) &&
7222 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7224 LocallyLimited = 1;
7227 gmx_sumi(1, &LocallyLimited, cr);
7229 if (LocallyLimited > 0)
7231 return FALSE;
7235 return TRUE;
7238 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7239 real cutoff_req)
7241 gmx_bool bCutoffAllowed;
7243 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7245 if (bCutoffAllowed)
7247 cr->dd->comm->cutoff = cutoff_req;
7250 return bCutoffAllowed;
7253 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7255 gmx_domdec_comm_t *comm;
7257 comm = cr->dd->comm;
7259 /* Turn on the DLB limiting (might have been on already) */
7260 comm->bPMELoadBalDLBLimits = TRUE;
7262 /* Change the cut-off limit */
7263 comm->PMELoadBal_max_cutoff = cutoff;
7265 if (debug)
7267 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7268 comm->PMELoadBal_max_cutoff);
7272 /* Sets whether we should later check the load imbalance data, so that
7273 * we can trigger dynamic load balancing if enough imbalance has
7274 * arisen.
7276 * Used after PME load balancing unlocks DLB, so that the check
7277 * whether DLB will be useful can happen immediately.
7279 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7281 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7283 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7285 if (bValue == TRUE)
7287 /* Store the DD partitioning count, so we can ignore cycle counts
7288 * over the next nstlist steps, which are often slower.
7290 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7295 /* Returns whether we should check if there has been enough load
7296 * imbalance to trigger dynamic load balancing.
7298 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7300 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7302 return FALSE;
7305 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7307 /* We ignore the first nstlist steps at the start of the run
7308 * or after PME load balancing or after turning DLB off, since
7309 * these often have extra allocation or cache miss overhead.
7311 return FALSE;
7314 /* We should check whether we should use DLB directly after
7315 * unlocking DLB. */
7316 if (dd->comm->bCheckWhetherToTurnDlbOn)
7318 /* This flag was set when the PME load-balancing routines
7319 unlocked DLB, and should now be cleared. */
7320 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7321 return TRUE;
7323 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7324 * partitionings (we do not do this every partitioning, so that we
7325 * avoid excessive communication). */
7326 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7328 return TRUE;
7331 return FALSE;
7334 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7336 return dlbIsOn(dd->comm);
7339 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7341 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7344 void dd_dlb_lock(gmx_domdec_t *dd)
7346 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7347 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7349 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7353 void dd_dlb_unlock(gmx_domdec_t *dd)
7355 /* We can only unlock DLB if it was temporarily locked, otherwise don't do anything */
7356 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7358 dd->comm->dlbState = edlbsOffCanTurnOn;
7359 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
7363 static void merge_cg_buffers(int ncell,
7364 gmx_domdec_comm_dim_t *cd, int pulse,
7365 int *ncg_cell,
7366 int *index_gl, int *recv_i,
7367 rvec *cg_cm, rvec *recv_vr,
7368 int *cgindex,
7369 cginfo_mb_t *cginfo_mb, int *cginfo)
7371 gmx_domdec_ind_t *ind, *ind_p;
7372 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7373 int shift, shift_at;
7375 ind = &cd->ind[pulse];
7377 /* First correct the already stored data */
7378 shift = ind->nrecv[ncell];
7379 for (cell = ncell-1; cell >= 0; cell--)
7381 shift -= ind->nrecv[cell];
7382 if (shift > 0)
7384 /* Move the cg's present from previous grid pulses */
7385 cg0 = ncg_cell[ncell+cell];
7386 cg1 = ncg_cell[ncell+cell+1];
7387 cgindex[cg1+shift] = cgindex[cg1];
7388 for (cg = cg1-1; cg >= cg0; cg--)
7390 index_gl[cg+shift] = index_gl[cg];
7391 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7392 cgindex[cg+shift] = cgindex[cg];
7393 cginfo[cg+shift] = cginfo[cg];
7395 /* Correct the already stored send indices for the shift */
7396 for (p = 1; p <= pulse; p++)
7398 ind_p = &cd->ind[p];
7399 cg0 = 0;
7400 for (c = 0; c < cell; c++)
7402 cg0 += ind_p->nsend[c];
7404 cg1 = cg0 + ind_p->nsend[cell];
7405 for (cg = cg0; cg < cg1; cg++)
7407 ind_p->index[cg] += shift;
7413 /* Merge in the communicated buffers */
7414 shift = 0;
7415 shift_at = 0;
7416 cg0 = 0;
7417 for (cell = 0; cell < ncell; cell++)
7419 cg1 = ncg_cell[ncell+cell+1] + shift;
7420 if (shift_at > 0)
7422 /* Correct the old cg indices */
7423 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7425 cgindex[cg+1] += shift_at;
7428 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7430 /* Copy this charge group from the buffer */
7431 index_gl[cg1] = recv_i[cg0];
7432 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7433 /* Add it to the cgindex */
7434 cg_gl = index_gl[cg1];
7435 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7436 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7437 cgindex[cg1+1] = cgindex[cg1] + nat;
7438 cg0++;
7439 cg1++;
7440 shift_at += nat;
7442 shift += ind->nrecv[cell];
7443 ncg_cell[ncell+cell+1] = cg1;
7447 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7448 int nzone, int cg0, const int *cgindex)
7450 int cg, zone, p;
7452 /* Store the atom block boundaries for easy copying of communication buffers
7454 cg = cg0;
7455 for (zone = 0; zone < nzone; zone++)
7457 for (p = 0; p < cd->np; p++)
7459 cd->ind[p].cell2at0[zone] = cgindex[cg];
7460 cg += cd->ind[p].nrecv[zone];
7461 cd->ind[p].cell2at1[zone] = cgindex[cg];
7466 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7468 int i;
7469 gmx_bool bMiss;
7471 bMiss = FALSE;
7472 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7474 if (!bLocalCG[link->a[i]])
7476 bMiss = TRUE;
7480 return bMiss;
7483 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7484 typedef struct {
7485 real c[DIM][4]; /* the corners for the non-bonded communication */
7486 real cr0; /* corner for rounding */
7487 real cr1[4]; /* corners for rounding */
7488 real bc[DIM]; /* corners for bonded communication */
7489 real bcr1; /* corner for rounding for bonded communication */
7490 } dd_corners_t;
7492 /* Determine the corners of the domain(s) we are communicating with */
7493 static void
7494 set_dd_corners(const gmx_domdec_t *dd,
7495 int dim0, int dim1, int dim2,
7496 gmx_bool bDistMB,
7497 dd_corners_t *c)
7499 const gmx_domdec_comm_t *comm;
7500 const gmx_domdec_zones_t *zones;
7501 int i, j;
7503 comm = dd->comm;
7505 zones = &comm->zones;
7507 /* Keep the compiler happy */
7508 c->cr0 = 0;
7509 c->bcr1 = 0;
7511 /* The first dimension is equal for all cells */
7512 c->c[0][0] = comm->cell_x0[dim0];
7513 if (bDistMB)
7515 c->bc[0] = c->c[0][0];
7517 if (dd->ndim >= 2)
7519 dim1 = dd->dim[1];
7520 /* This cell row is only seen from the first row */
7521 c->c[1][0] = comm->cell_x0[dim1];
7522 /* All rows can see this row */
7523 c->c[1][1] = comm->cell_x0[dim1];
7524 if (dlbIsOn(dd->comm))
7526 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7527 if (bDistMB)
7529 /* For the multi-body distance we need the maximum */
7530 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7533 /* Set the upper-right corner for rounding */
7534 c->cr0 = comm->cell_x1[dim0];
7536 if (dd->ndim >= 3)
7538 dim2 = dd->dim[2];
7539 for (j = 0; j < 4; j++)
7541 c->c[2][j] = comm->cell_x0[dim2];
7543 if (dlbIsOn(dd->comm))
7545 /* Use the maximum of the i-cells that see a j-cell */
7546 for (i = 0; i < zones->nizone; i++)
7548 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7550 if (j >= 4)
7552 c->c[2][j-4] =
7553 std::max(c->c[2][j-4],
7554 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7558 if (bDistMB)
7560 /* For the multi-body distance we need the maximum */
7561 c->bc[2] = comm->cell_x0[dim2];
7562 for (i = 0; i < 2; i++)
7564 for (j = 0; j < 2; j++)
7566 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7572 /* Set the upper-right corner for rounding */
7573 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7574 * Only cell (0,0,0) can see cell 7 (1,1,1)
7576 c->cr1[0] = comm->cell_x1[dim1];
7577 c->cr1[3] = comm->cell_x1[dim1];
7578 if (dlbIsOn(dd->comm))
7580 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7581 if (bDistMB)
7583 /* For the multi-body distance we need the maximum */
7584 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7591 /* Determine which cg's we need to send in this pulse from this zone */
7592 static void
7593 get_zone_pulse_cgs(gmx_domdec_t *dd,
7594 int zonei, int zone,
7595 int cg0, int cg1,
7596 const int *index_gl,
7597 const int *cgindex,
7598 int dim, int dim_ind,
7599 int dim0, int dim1, int dim2,
7600 real r_comm2, real r_bcomm2,
7601 matrix box,
7602 ivec tric_dist,
7603 rvec *normal,
7604 real skew_fac2_d, real skew_fac_01,
7605 rvec *v_d, rvec *v_0, rvec *v_1,
7606 const dd_corners_t *c,
7607 rvec sf2_round,
7608 gmx_bool bDistBonded,
7609 gmx_bool bBondComm,
7610 gmx_bool bDist2B,
7611 gmx_bool bDistMB,
7612 rvec *cg_cm,
7613 int *cginfo,
7614 gmx_domdec_ind_t *ind,
7615 int **ibuf, int *ibuf_nalloc,
7616 vec_rvec_t *vbuf,
7617 int *nsend_ptr,
7618 int *nat_ptr,
7619 int *nsend_z_ptr)
7621 gmx_domdec_comm_t *comm;
7622 gmx_bool bScrew;
7623 gmx_bool bDistMB_pulse;
7624 int cg, i;
7625 real r2, rb2, r, tric_sh;
7626 rvec rn, rb;
7627 int dimd;
7628 int nsend_z, nsend, nat;
7630 comm = dd->comm;
7632 bScrew = (dd->bScrewPBC && dim == XX);
7634 bDistMB_pulse = (bDistMB && bDistBonded);
7636 nsend_z = 0;
7637 nsend = *nsend_ptr;
7638 nat = *nat_ptr;
7640 for (cg = cg0; cg < cg1; cg++)
7642 r2 = 0;
7643 rb2 = 0;
7644 if (tric_dist[dim_ind] == 0)
7646 /* Rectangular direction, easy */
7647 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7648 if (r > 0)
7650 r2 += r*r;
7652 if (bDistMB_pulse)
7654 r = cg_cm[cg][dim] - c->bc[dim_ind];
7655 if (r > 0)
7657 rb2 += r*r;
7660 /* Rounding gives at most a 16% reduction
7661 * in communicated atoms
7663 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7665 r = cg_cm[cg][dim0] - c->cr0;
7666 /* This is the first dimension, so always r >= 0 */
7667 r2 += r*r;
7668 if (bDistMB_pulse)
7670 rb2 += r*r;
7673 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7675 r = cg_cm[cg][dim1] - c->cr1[zone];
7676 if (r > 0)
7678 r2 += r*r;
7680 if (bDistMB_pulse)
7682 r = cg_cm[cg][dim1] - c->bcr1;
7683 if (r > 0)
7685 rb2 += r*r;
7690 else
7692 /* Triclinic direction, more complicated */
7693 clear_rvec(rn);
7694 clear_rvec(rb);
7695 /* Rounding, conservative as the skew_fac multiplication
7696 * will slightly underestimate the distance.
7698 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7700 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7701 for (i = dim0+1; i < DIM; i++)
7703 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7705 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7706 if (bDistMB_pulse)
7708 rb[dim0] = rn[dim0];
7709 rb2 = r2;
7711 /* Take care that the cell planes along dim0 might not
7712 * be orthogonal to those along dim1 and dim2.
7714 for (i = 1; i <= dim_ind; i++)
7716 dimd = dd->dim[i];
7717 if (normal[dim0][dimd] > 0)
7719 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7720 if (bDistMB_pulse)
7722 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7727 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7729 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7730 tric_sh = 0;
7731 for (i = dim1+1; i < DIM; i++)
7733 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7735 rn[dim1] += tric_sh;
7736 if (rn[dim1] > 0)
7738 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7739 /* Take care of coupling of the distances
7740 * to the planes along dim0 and dim1 through dim2.
7742 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7743 /* Take care that the cell planes along dim1
7744 * might not be orthogonal to that along dim2.
7746 if (normal[dim1][dim2] > 0)
7748 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7751 if (bDistMB_pulse)
7753 rb[dim1] +=
7754 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7755 if (rb[dim1] > 0)
7757 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7758 /* Take care of coupling of the distances
7759 * to the planes along dim0 and dim1 through dim2.
7761 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7762 /* Take care that the cell planes along dim1
7763 * might not be orthogonal to that along dim2.
7765 if (normal[dim1][dim2] > 0)
7767 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7772 /* The distance along the communication direction */
7773 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7774 tric_sh = 0;
7775 for (i = dim+1; i < DIM; i++)
7777 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7779 rn[dim] += tric_sh;
7780 if (rn[dim] > 0)
7782 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7783 /* Take care of coupling of the distances
7784 * to the planes along dim0 and dim1 through dim2.
7786 if (dim_ind == 1 && zonei == 1)
7788 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7791 if (bDistMB_pulse)
7793 clear_rvec(rb);
7794 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7795 if (rb[dim] > 0)
7797 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7798 /* Take care of coupling of the distances
7799 * to the planes along dim0 and dim1 through dim2.
7801 if (dim_ind == 1 && zonei == 1)
7803 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
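/* Summary of the send criterion below: a charge group is communicated when
 * it is within the non-bonded cut-off (r2 < r_comm2), or, for the first
 * pulse, within the bonded cut-off (rb2 or r2 < r_bcomm2) and, with
 * bBondComm, only when it actually has a bonded link to a charge group
 * that is not yet local.
 */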
7809 if (r2 < r_comm2 ||
7810 (bDistBonded &&
7811 ((bDistMB && rb2 < r_bcomm2) ||
7812 (bDist2B && r2 < r_bcomm2)) &&
7813 (!bBondComm ||
7814 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7815 missing_link(comm->cglink, index_gl[cg],
7816 comm->bLocalCG)))))
7818 /* Make an index to the local charge groups */
7819 if (nsend+1 > ind->nalloc)
7821 ind->nalloc = over_alloc_large(nsend+1);
7822 srenew(ind->index, ind->nalloc);
7824 if (nsend+1 > *ibuf_nalloc)
7826 *ibuf_nalloc = over_alloc_large(nsend+1);
7827 srenew(*ibuf, *ibuf_nalloc);
7829 ind->index[nsend] = cg;
7830 (*ibuf)[nsend] = index_gl[cg];
7831 nsend_z++;
7832 vec_rvec_check_alloc(vbuf, nsend+1);
7834 if (dd->ci[dim] == 0)
7836 /* Correct cg_cm for pbc */
7837 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7838 if (bScrew)
7840 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7841 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7844 else
7846 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7848 nsend++;
7849 nat += cgindex[cg+1] - cgindex[cg];
7853 *nsend_ptr = nsend;
7854 *nat_ptr = nat;
7855 *nsend_z_ptr = nsend_z;
7858 static void setup_dd_communication(gmx_domdec_t *dd,
7859 matrix box, gmx_ddbox_t *ddbox,
7860 t_forcerec *fr,
7861 t_state *state, PaddedRVecVector *f)
7863 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7864 int nzone, nzone_send, zone, zonei, cg0, cg1;
7865 int c, i, cg, cg_gl, nrcg;
7866 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7867 gmx_domdec_comm_t *comm;
7868 gmx_domdec_zones_t *zones;
7869 gmx_domdec_comm_dim_t *cd;
7870 gmx_domdec_ind_t *ind;
7871 cginfo_mb_t *cginfo_mb;
7872 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7873 real r_comm2, r_bcomm2;
7874 dd_corners_t corners;
7875 ivec tric_dist;
7876 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
7877 real skew_fac2_d, skew_fac_01;
7878 rvec sf2_round;
7879 int nsend, nat;
7880 int th;
7882 if (debug)
7884 fprintf(debug, "Setting up DD communication\n");
7887 comm = dd->comm;
7889 if (comm->nth == 0)
7891 /* Initialize the thread data.
7892 * This can not be done in init_domain_decomposition,
7893 * as the number of threads is determined later.
7895 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7896 if (comm->nth > 1)
7898 snew(comm->dth, comm->nth);
7902 switch (fr->cutoff_scheme)
7904 case ecutsGROUP:
7905 cg_cm = fr->cg_cm;
7906 break;
7907 case ecutsVERLET:
7908 cg_cm = as_rvec_array(state->x.data());
7909 break;
7910 default:
7911 gmx_incons("unimplemented");
7912 cg_cm = nullptr;
7915 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7917 /* Check if we need to use triclinic distances */
7918 tric_dist[dim_ind] = 0;
7919 for (i = 0; i <= dim_ind; i++)
7921 if (ddbox->tric_dir[dd->dim[i]])
7923 tric_dist[dim_ind] = 1;
7928 bBondComm = comm->bBondComm;
7930 /* Do we need to determine extra distances for multi-body bondeds? */
7931 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
7933 /* Do we need to determine extra distances for only two-body bondeds? */
7934 bDist2B = (bBondComm && !bDistMB);
7936 r_comm2 = gmx::square(comm->cutoff);
7937 r_bcomm2 = gmx::square(comm->cutoff_mbody);
7939 if (debug)
7941 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
7944 zones = &comm->zones;
7946 dim0 = dd->dim[0];
7947 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7948 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7950 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
7952 /* Triclinic stuff */
7953 normal = ddbox->normal;
7954 skew_fac_01 = 0;
7955 if (dd->ndim >= 2)
7957 v_0 = ddbox->v[dim0];
7958 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7960 /* Determine the coupling coefficient for the distances
7961 * to the cell planes along dim0 and dim1 through dim2.
7962 * This is required for correct rounding.
7964 skew_fac_01 =
7965 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7966 if (debug)
7968 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
7972 if (dd->ndim >= 3)
7974 v_1 = ddbox->v[dim1];
7977 zone_cg_range = zones->cg_range;
7978 index_gl = dd->index_gl;
7979 cgindex = dd->cgindex;
7980 cginfo_mb = fr->cginfo_mb;
7982 zone_cg_range[0] = 0;
7983 zone_cg_range[1] = dd->ncg_home;
7984 comm->zone_ncg1[0] = dd->ncg_home;
7985 pos_cg = dd->ncg_home;
7987 nat_tot = dd->nat_home;
7988 nzone = 1;
7989 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7991 dim = dd->dim[dim_ind];
7992 cd = &comm->cd[dim_ind];
7994 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7996 /* No pbc in this dimension, the first node should not comm. */
7997 nzone_send = 0;
7999 else
8001 nzone_send = nzone;
8004 v_d = ddbox->v[dim];
8005 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8007 cd->bInPlace = TRUE;
8008 for (p = 0; p < cd->np; p++)
8010 /* Only atoms communicated in the first pulse are used
8011 * for multi-body bonded interactions or for bBondComm.
8013 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8015 ind = &cd->ind[p];
8016 nsend = 0;
8017 nat = 0;
8018 for (zone = 0; zone < nzone_send; zone++)
8020 if (tric_dist[dim_ind] && dim_ind > 0)
8022 /* Determine slightly more optimized skew_fac's
8023 * for rounding.
8024 * This reduces the number of communicated atoms
8025 * by about 10% for 3D DD of rhombic dodecahedra.
8027 for (dimd = 0; dimd < dim; dimd++)
8029 sf2_round[dimd] = 1;
8030 if (ddbox->tric_dir[dimd])
8032 for (i = dd->dim[dimd]+1; i < DIM; i++)
8034 /* If we are shifted in dimension i
8035 * and the cell plane is tilted forward
8036 * in dimension i, skip this coupling.
8038 if (!(zones->shift[nzone+zone][i] &&
8039 ddbox->v[dimd][i][dimd] >= 0))
8041 sf2_round[dimd] +=
8042 gmx::square(ddbox->v[dimd][i][dimd]);
8045 sf2_round[dimd] = 1/sf2_round[dimd];
8050 zonei = zone_perm[dim_ind][zone];
8051 if (p == 0)
8053 /* Here we permute the zones to obtain a convenient order
8054 * for neighbor searching
8056 cg0 = zone_cg_range[zonei];
8057 cg1 = zone_cg_range[zonei+1];
8059 else
8061 /* Look only at the cg's received in the previous grid pulse
8063 cg1 = zone_cg_range[nzone+zone+1];
8064 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8067 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8068 for (th = 0; th < comm->nth; th++)
8072 gmx_domdec_ind_t *ind_p;
8073 int **ibuf_p, *ibuf_nalloc_p;
8074 vec_rvec_t *vbuf_p;
8075 int *nsend_p, *nat_p;
8076 int *nsend_zone_p;
8077 int cg0_th, cg1_th;
8079 if (th == 0)
8081 /* Thread 0 writes in the comm buffers */
8082 ind_p = ind;
8083 ibuf_p = &comm->buf_int;
8084 ibuf_nalloc_p = &comm->nalloc_int;
8085 vbuf_p = &comm->vbuf;
8086 nsend_p = &nsend;
8087 nat_p = &nat;
8088 nsend_zone_p = &ind->nsend[zone];
8090 else
8092 /* Other threads write into temp buffers */
8093 ind_p = &comm->dth[th].ind;
8094 ibuf_p = &comm->dth[th].ibuf;
8095 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8096 vbuf_p = &comm->dth[th].vbuf;
8097 nsend_p = &comm->dth[th].nsend;
8098 nat_p = &comm->dth[th].nat;
8099 nsend_zone_p = &comm->dth[th].nsend_zone;
8101 comm->dth[th].nsend = 0;
8102 comm->dth[th].nat = 0;
8103 comm->dth[th].nsend_zone = 0;
8106 if (comm->nth == 1)
8108 cg0_th = cg0;
8109 cg1_th = cg1;
8111 else
8113 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8114 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
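/* Example of the static work split (numbers assumed): with comm->nth = 4
 * threads and cg1 - cg0 = 10 charge groups, the ranges relative to cg0
 * are [0,2), [2,5), [5,7) and [7,10).
 */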
8117 /* Get the cg's for this pulse in this zone */
8118 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8119 index_gl, cgindex,
8120 dim, dim_ind, dim0, dim1, dim2,
8121 r_comm2, r_bcomm2,
8122 box, tric_dist,
8123 normal, skew_fac2_d, skew_fac_01,
8124 v_d, v_0, v_1, &corners, sf2_round,
8125 bDistBonded, bBondComm,
8126 bDist2B, bDistMB,
8127 cg_cm, fr->cginfo,
8128 ind_p,
8129 ibuf_p, ibuf_nalloc_p,
8130 vbuf_p,
8131 nsend_p, nat_p,
8132 nsend_zone_p);
8134 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8135 } // END
8137 /* Append data of threads>=1 to the communication buffers */
8138 for (th = 1; th < comm->nth; th++)
8140 dd_comm_setup_work_t *dth;
8141 int i, ns1;
8143 dth = &comm->dth[th];
8145 ns1 = nsend + dth->nsend_zone;
8146 if (ns1 > ind->nalloc)
8148 ind->nalloc = over_alloc_dd(ns1);
8149 srenew(ind->index, ind->nalloc);
8151 if (ns1 > comm->nalloc_int)
8153 comm->nalloc_int = over_alloc_dd(ns1);
8154 srenew(comm->buf_int, comm->nalloc_int);
8156 if (ns1 > comm->vbuf.nalloc)
8158 comm->vbuf.nalloc = over_alloc_dd(ns1);
8159 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8162 for (i = 0; i < dth->nsend_zone; i++)
8164 ind->index[nsend] = dth->ind.index[i];
8165 comm->buf_int[nsend] = dth->ibuf[i];
8166 copy_rvec(dth->vbuf.v[i],
8167 comm->vbuf.v[nsend]);
8168 nsend++;
8170 nat += dth->nat;
8171 ind->nsend[zone] += dth->nsend_zone;
8174 /* Clear the counts in case we do not have pbc */
8175 for (zone = nzone_send; zone < nzone; zone++)
8177 ind->nsend[zone] = 0;
8179 ind->nsend[nzone] = nsend;
8180 ind->nsend[nzone+1] = nat;
8181 /* Communicate the number of cg's and atoms to receive */
8182 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8183 ind->nsend, nzone+2,
8184 ind->nrecv, nzone+2);
8186 /* The rvec buffer is also required for atom buffers of size nsend
8187 * in dd_move_x and dd_move_f.
8189 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8191 if (p > 0)
8193 /* We can receive in place if only the last zone is not empty */
8194 for (zone = 0; zone < nzone-1; zone++)
8196 if (ind->nrecv[zone] > 0)
8198 cd->bInPlace = FALSE;
8201 if (!cd->bInPlace)
8203 /* The int buffer is only required here for the cg indices */
8204 if (ind->nrecv[nzone] > comm->nalloc_int2)
8206 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8207 srenew(comm->buf_int2, comm->nalloc_int2);
8209 /* The rvec buffer is also required for atom buffers
8210 * of size nrecv in dd_move_x and dd_move_f.
8212 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8213 vec_rvec_check_alloc(&comm->vbuf2, i);
8217 /* Make space for the global cg indices */
8218 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8219 || dd->cg_nalloc == 0)
8221 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8222 srenew(index_gl, dd->cg_nalloc);
8223 srenew(cgindex, dd->cg_nalloc+1);
8225 /* Communicate the global cg indices */
8226 if (cd->bInPlace)
8228 recv_i = index_gl + pos_cg;
8230 else
8232 recv_i = comm->buf_int2;
8234 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8235 comm->buf_int, nsend,
8236 recv_i, ind->nrecv[nzone]);
8238 /* Make space for cg_cm */
8239 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8240 if (fr->cutoff_scheme == ecutsGROUP)
8242 cg_cm = fr->cg_cm;
8244 else
8246 cg_cm = as_rvec_array(state->x.data());
8248 /* Communicate cg_cm */
8249 if (cd->bInPlace)
8251 recv_vr = cg_cm + pos_cg;
8253 else
8255 recv_vr = comm->vbuf2.v;
8257 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8258 comm->vbuf.v, nsend,
8259 recv_vr, ind->nrecv[nzone]);
8261 /* Make the charge group index */
8262 if (cd->bInPlace)
8264 zone = (p == 0 ? 0 : nzone - 1);
8265 while (zone < nzone)
8267 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8269 cg_gl = index_gl[pos_cg];
8270 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8271 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8272 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8273 if (bBondComm)
8275 /* Update the charge group presence,
8276 * so we can use it in the next pass of the loop.
8278 comm->bLocalCG[cg_gl] = TRUE;
8280 pos_cg++;
8282 if (p == 0)
8284 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8286 zone++;
8287 zone_cg_range[nzone+zone] = pos_cg;
8290 else
8292 /* This part of the code is never executed with bBondComm. */
8293 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8294 index_gl, recv_i, cg_cm, recv_vr,
8295 cgindex, fr->cginfo_mb, fr->cginfo);
8296 pos_cg += ind->nrecv[nzone];
8298 nat_tot += ind->nrecv[nzone+1];
8300 if (!cd->bInPlace)
8302 /* Store the atom block for easy copying of communication buffers */
8303 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8305 nzone += nzone;
8307 dd->index_gl = index_gl;
8308 dd->cgindex = cgindex;
8310 dd->ncg_tot = zone_cg_range[zones->n];
8311 dd->nat_tot = nat_tot;
8312 comm->nat[ddnatHOME] = dd->nat_home;
8313 for (i = ddnatZONE; i < ddnatNR; i++)
8315 comm->nat[i] = dd->nat_tot;
8318 if (!bBondComm)
8320 /* We don't need to update cginfo, since that was already done above.
8321 * So we pass NULL for the forcerec.
8323 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8324 nullptr, comm->bLocalCG);
8327 if (debug)
8329 fprintf(debug, "Finished setting up DD communication, zones:");
8330 for (c = 0; c < zones->n; c++)
8332 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8334 fprintf(debug, "\n");
8338 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8340 int c;
8342 for (c = 0; c < zones->nizone; c++)
8344 zones->izone[c].cg1 = zones->cg_range[c+1];
8345 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8346 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8350 static void set_zones_size(gmx_domdec_t *dd,
8351 matrix box, const gmx_ddbox_t *ddbox,
8352 int zone_start, int zone_end)
8354 gmx_domdec_comm_t *comm;
8355 gmx_domdec_zones_t *zones;
8356 gmx_bool bDistMB;
8357 int z, zi, d, dim;
8358 real rcs, rcmbs;
8359 int i, j;
8360 real vol;
8362 comm = dd->comm;
8364 zones = &comm->zones;
8366 /* Do we need to determine extra distances for multi-body bondeds? */
8367 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8369 for (z = zone_start; z < zone_end; z++)
8371 /* Copy cell limits to zone limits.
8372 * Valid for non-DD dims and non-shifted dims.
8374 copy_rvec(comm->cell_x0, zones->size[z].x0);
8375 copy_rvec(comm->cell_x1, zones->size[z].x1);
8378 for (d = 0; d < dd->ndim; d++)
8380 dim = dd->dim[d];
8382 for (z = 0; z < zones->n; z++)
8384 /* With a staggered grid we have different sizes
8385 * for non-shifted dimensions.
8387 if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8389 if (d == 1)
8391 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8392 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8394 else if (d == 2)
8396 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8397 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8402 rcs = comm->cutoff;
8403 rcmbs = comm->cutoff_mbody;
8404 if (ddbox->tric_dir[dim])
8406 rcs /= ddbox->skew_fac[dim];
8407 rcmbs /= ddbox->skew_fac[dim];
8410 /* Set the lower limit for the shifted zone dimensions */
8411 for (z = zone_start; z < zone_end; z++)
8413 if (zones->shift[z][dim] > 0)
8415 dim = dd->dim[d];
8416 if (!dlbIsOn(dd->comm) || d == 0)
8418 zones->size[z].x0[dim] = comm->cell_x1[dim];
8419 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8421 else
8423 /* Here we take the lower limit of the zone from
8424 * the lowest domain of the zone below.
8426 if (z < 4)
8428 zones->size[z].x0[dim] =
8429 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8431 else
8433 if (d == 1)
8435 zones->size[z].x0[dim] =
8436 zones->size[zone_perm[2][z-4]].x0[dim];
8438 else
8440 zones->size[z].x0[dim] =
8441 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8444 /* A temporary limit, updated below */
8445 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8447 if (bDistMB)
8449 for (zi = 0; zi < zones->nizone; zi++)
8451 if (zones->shift[zi][dim] == 0)
8453 /* This takes the whole zone into account.
8454 * With multiple pulses this will lead
8455 * to a larger zone than strictly necessary.
8457 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8458 zones->size[zi].x1[dim]+rcmbs);
8466 /* Loop over the i-zones to set the upper limit of each
8467 * j-zone they see.
8469 for (zi = 0; zi < zones->nizone; zi++)
8471 if (zones->shift[zi][dim] == 0)
8473 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8475 if (zones->shift[z][dim] > 0)
8477 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8478 zones->size[zi].x1[dim]+rcs);
8485 for (z = zone_start; z < zone_end; z++)
8487 /* Initialization only required to keep the compiler happy */
8488 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8489 int nc, c;
8491 /* To determine the bounding box for a zone we need to find
8492 * the extremes over 4, 2 or 1 corners.
8494 nc = 1 << (ddbox->nboundeddim - 1);
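/* For example: nboundeddim = 3 gives nc = 4 corners (x is fixed at 0,
 * extremes taken over y and z), nboundeddim = 2 gives 2 and
 * nboundeddim = 1 gives 1, matching the comment above.
 */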
8496 for (c = 0; c < nc; c++)
8498 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8499 corner[XX] = 0;
8500 if ((c & 1) == 0)
8502 corner[YY] = zones->size[z].x0[YY];
8504 else
8506 corner[YY] = zones->size[z].x1[YY];
8508 if ((c & 2) == 0)
8510 corner[ZZ] = zones->size[z].x0[ZZ];
8512 else
8514 corner[ZZ] = zones->size[z].x1[ZZ];
8516 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8517 box[ZZ][1 - dd->dim[0]] != 0)
8519 /* With 1D domain decomposition the cg's are not in
8520 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8521 * Shift the corner of the z-vector back along the box
8522 * vector of dimension d, so it will later end up at 0 along d.
8523 * This can affect the location of this corner along dd->dim[0]
8524 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8526 int d = 1 - dd->dim[0];
8528 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8530 /* Apply the triclinic couplings */
8531 assert(ddbox->npbcdim <= DIM);
8532 for (i = YY; i < ddbox->npbcdim; i++)
8534 for (j = XX; j < i; j++)
8536 corner[j] += corner[i]*box[i][j]/box[i][i];
8539 if (c == 0)
8541 copy_rvec(corner, corner_min);
8542 copy_rvec(corner, corner_max);
8544 else
8546 for (i = 0; i < DIM; i++)
8548 corner_min[i] = std::min(corner_min[i], corner[i]);
8549 corner_max[i] = std::max(corner_max[i], corner[i]);
8553 /* Copy the extreme corners without offset along x */
8554 for (i = 0; i < DIM; i++)
8556 zones->size[z].bb_x0[i] = corner_min[i];
8557 zones->size[z].bb_x1[i] = corner_max[i];
8559 /* Add the offset along x */
8560 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8561 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8564 if (zone_start == 0)
8566 vol = 1;
8567 for (dim = 0; dim < DIM; dim++)
8569 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8571 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8574 if (debug)
8576 for (z = zone_start; z < zone_end; z++)
8578 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8580 zones->size[z].x0[XX], zones->size[z].x1[XX],
8581 zones->size[z].x0[YY], zones->size[z].x1[YY],
8582 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8583 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8585 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8586 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8587 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8592 static int comp_cgsort(const void *a, const void *b)
8594 int comp;
8596 gmx_cgsort_t *cga, *cgb;
8597 cga = (gmx_cgsort_t *)a;
8598 cgb = (gmx_cgsort_t *)b;
8600 comp = cga->nsc - cgb->nsc;
8601 if (comp == 0)
8603 comp = cga->ind_gl - cgb->ind_gl;
8606 return comp;
8609 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8610 int *a, int *buf)
8612 int i;
8614 /* Order the data */
8615 for (i = 0; i < n; i++)
8617 buf[i] = a[sort[i].ind];
8620 /* Copy back to the original array */
8621 for (i = 0; i < n; i++)
8623 a[i] = buf[i];
8627 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8628 rvec *v, rvec *buf)
8630 int i;
8632 /* Order the data */
8633 for (i = 0; i < n; i++)
8635 copy_rvec(v[sort[i].ind], buf[i]);
8638 /* Copy back to the original array */
8639 for (i = 0; i < n; i++)
8641 copy_rvec(buf[i], v[i]);
8645 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8646 rvec *v, rvec *buf)
8648 int a, atot, cg, cg0, cg1, i;
8650 if (cgindex == nullptr)
8652 /* Avoid the useless loop over the atoms within a cg */
8653 order_vec_cg(ncg, sort, v, buf);
8655 return;
8658 /* Order the data */
8659 a = 0;
8660 for (cg = 0; cg < ncg; cg++)
8662 cg0 = cgindex[sort[cg].ind];
8663 cg1 = cgindex[sort[cg].ind+1];
8664 for (i = cg0; i < cg1; i++)
8666 copy_rvec(v[i], buf[a]);
8667 a++;
8670 atot = a;
8672 /* Copy back to the original array */
8673 for (a = 0; a < atot; a++)
8675 copy_rvec(buf[a], v[a]);
8679 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8680 int nsort_new, gmx_cgsort_t *sort_new,
8681 gmx_cgsort_t *sort1)
8683 int i1, i2, i_new;
8685 /* The new indices are not very ordered, so we qsort them */
8686 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8688 /* sort2 is already ordered, so now we can merge the two arrays */
8689 i1 = 0;
8690 i2 = 0;
8691 i_new = 0;
8692 while (i2 < nsort2 || i_new < nsort_new)
8694 if (i2 == nsort2)
8696 sort1[i1++] = sort_new[i_new++];
8698 else if (i_new == nsort_new)
8700 sort1[i1++] = sort2[i2++];
8702 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8703 (sort2[i2].nsc == sort_new[i_new].nsc &&
8704 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8706 sort1[i1++] = sort2[i2++];
8708 else
8710 sort1[i1++] = sort_new[i_new++];
8715 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8717 gmx_domdec_sort_t *sort;
8718 gmx_cgsort_t *cgsort, *sort_i;
8719 int ncg_new, nsort2, nsort_new, i, *a, moved;
8721 sort = dd->comm->sort;
8723 a = fr->ns->grid->cell_index;
8725 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8727 if (ncg_home_old >= 0)
8729 /* The charge groups that remained in the same ns grid cell
8730 * are already completely ordered. So we can sort efficiently by sorting
8731 * only the charge groups that moved and merging them into the stationary list.
8733 ncg_new = 0;
8734 nsort2 = 0;
8735 nsort_new = 0;
8736 for (i = 0; i < dd->ncg_home; i++)
8738 /* Check if this cg did not move to another node */
8739 if (a[i] < moved)
8741 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8743 /* This cg is new on this node or has moved to another ns grid cell */
8744 if (nsort_new >= sort->sort_new_nalloc)
8746 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8747 srenew(sort->sort_new, sort->sort_new_nalloc);
8749 sort_i = &(sort->sort_new[nsort_new++]);
8751 else
8753 /* This cg did not move */
8754 sort_i = &(sort->sort2[nsort2++]);
8756 /* Sort on the ns grid cell indices
8757 * and the global topology index.
8758 * index_gl is irrelevant with cell ns,
8759 * but we set it here anyhow to avoid a conditional.
8761 sort_i->nsc = a[i];
8762 sort_i->ind_gl = dd->index_gl[i];
8763 sort_i->ind = i;
8764 ncg_new++;
8767 if (debug)
8769 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8770 nsort2, nsort_new);
8772 /* Sort efficiently */
8773 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8774 sort->sort);
8776 else
8778 cgsort = sort->sort;
8779 ncg_new = 0;
8780 for (i = 0; i < dd->ncg_home; i++)
8782 /* Sort on the ns grid cell indices
8783 * and the global topology index
8785 cgsort[i].nsc = a[i];
8786 cgsort[i].ind_gl = dd->index_gl[i];
8787 cgsort[i].ind = i;
8788 if (cgsort[i].nsc < moved)
8790 ncg_new++;
8793 if (debug)
8795 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8797 /* Determine the order of the charge groups using qsort */
8798 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8801 return ncg_new;
8804 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8806 gmx_cgsort_t *sort;
8807 int ncg_new, i, na;
8808 const int *a;
8810 sort = dd->comm->sort->sort;
8812 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8814 ncg_new = 0;
8815 for (i = 0; i < na; i++)
8817 if (a[i] >= 0)
8819 sort[ncg_new].ind = a[i];
8820 ncg_new++;
8824 return ncg_new;
8827 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8828 int ncg_home_old)
8830 gmx_domdec_sort_t *sort;
8831 gmx_cgsort_t *cgsort;
8832 int *cgindex;
8833 int ncg_new, i, *ibuf, cgsize;
8834 rvec *vbuf;
8836 sort = dd->comm->sort;
8838 if (dd->ncg_home > sort->sort_nalloc)
8840 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8841 srenew(sort->sort, sort->sort_nalloc);
8842 srenew(sort->sort2, sort->sort_nalloc);
8844 cgsort = sort->sort;
8846 switch (fr->cutoff_scheme)
8848 case ecutsGROUP:
8849 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8850 break;
8851 case ecutsVERLET:
8852 ncg_new = dd_sort_order_nbnxn(dd, fr);
8853 break;
8854 default:
8855 gmx_incons("unimplemented");
8856 ncg_new = 0;
8859 /* We alloc with the old size, since cgindex is still old */
8860 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8861 vbuf = dd->comm->vbuf.v;
8863 if (dd->comm->bCGs)
8865 cgindex = dd->cgindex;
8867 else
8869 cgindex = nullptr;
8872 /* Remove the charge groups which are no longer at home here */
8873 dd->ncg_home = ncg_new;
8874 if (debug)
8876 fprintf(debug, "Set the new home charge group count to %d\n",
8877 dd->ncg_home);
8880 /* Reorder the state */
8881 if (state->flags & (1 << estX))
8883 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
8885 if (state->flags & (1 << estV))
8887 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
8889 if (state->flags & (1 << estCGP))
8891 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
8894 if (fr->cutoff_scheme == ecutsGROUP)
8896 /* Reorder cgcm */
8897 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8900 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8902 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8903 srenew(sort->ibuf, sort->ibuf_nalloc);
8905 ibuf = sort->ibuf;
8906 /* Reorder the global cg index */
8907 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8908 /* Reorder the cginfo */
8909 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8910 /* Rebuild the local cg index */
8911 if (dd->comm->bCGs)
8913 ibuf[0] = 0;
8914 for (i = 0; i < dd->ncg_home; i++)
8916 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8917 ibuf[i+1] = ibuf[i] + cgsize;
8919 for (i = 0; i < dd->ncg_home+1; i++)
8921 dd->cgindex[i] = ibuf[i];
8924 else
8926 for (i = 0; i < dd->ncg_home+1; i++)
8928 dd->cgindex[i] = i;
8931 /* Set the home atom number */
8932 dd->nat_home = dd->cgindex[dd->ncg_home];
8934 if (fr->cutoff_scheme == ecutsVERLET)
8936 /* The atoms are now exactly in grid order, update the grid order */
8937 nbnxn_set_atomorder(fr->nbv->nbs);
8939 else
8941 /* Copy the sorted ns cell indices back to the ns grid struct */
8942 for (i = 0; i < dd->ncg_home; i++)
8944 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
8946 fr->ns->grid->nr = dd->ncg_home;
8950 static void add_dd_statistics(gmx_domdec_t *dd)
8952 gmx_domdec_comm_t *comm;
8953 int ddnat;
8955 comm = dd->comm;
8957 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
8959 comm->sum_nat[ddnat-ddnatZONE] +=
8960 comm->nat[ddnat] - comm->nat[ddnat-1];
8962 comm->ndecomp++;
8965 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8967 gmx_domdec_comm_t *comm;
8968 int ddnat;
8970 comm = dd->comm;
8972 /* Reset all the statistics and counters for total run counting */
8973 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
8975 comm->sum_nat[ddnat-ddnatZONE] = 0;
8977 comm->ndecomp = 0;
8978 comm->nload = 0;
8979 comm->load_step = 0;
8980 comm->load_sum = 0;
8981 comm->load_max = 0;
8982 clear_ivec(comm->load_lim);
8983 comm->load_mdf = 0;
8984 comm->load_pme = 0;
8987 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
8989 gmx_domdec_comm_t *comm;
8990 int ddnat;
8991 double av;
8993 comm = cr->dd->comm;
8995 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
8997 if (fplog == nullptr)
8999 return;
9002 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9004 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9006 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9007 switch (ddnat)
9009 case ddnatZONE:
9010 fprintf(fplog,
9011 " av. #atoms communicated per step for force: %d x %.1f\n",
9012 2, av);
9013 break;
9014 case ddnatVSITE:
9015 if (cr->dd->vsite_comm)
9017 fprintf(fplog,
9018 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9019 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9020 av);
9022 break;
9023 case ddnatCON:
9024 if (cr->dd->constraint_comm)
9026 fprintf(fplog,
9027 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9028 1 + ir->nLincsIter, av);
9030 break;
9031 default:
9032 gmx_incons(" Unknown type for DD statistics");
9035 fprintf(fplog, "\n");
9037 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9039 print_dd_load_av(fplog, cr->dd);
9043 void dd_partition_system(FILE *fplog,
9044 gmx_int64_t step,
9045 t_commrec *cr,
9046 gmx_bool bMasterState,
9047 int nstglobalcomm,
9048 t_state *state_global,
9049 const gmx_mtop_t *top_global,
9050 const t_inputrec *ir,
9051 t_state *state_local,
9052 PaddedRVecVector *f,
9053 t_mdatoms *mdatoms,
9054 gmx_localtop_t *top_local,
9055 t_forcerec *fr,
9056 gmx_vsite_t *vsite,
9057 gmx_constr_t constr,
9058 t_nrnb *nrnb,
9059 gmx_wallcycle_t wcycle,
9060 gmx_bool bVerbose)
9062 gmx_domdec_t *dd;
9063 gmx_domdec_comm_t *comm;
9064 gmx_ddbox_t ddbox = {0};
9065 t_block *cgs_gl;
9066 gmx_int64_t step_pcoupl;
9067 rvec cell_ns_x0, cell_ns_x1;
9068 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9069 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9070 gmx_bool bRedist, bSortCG, bResortAll;
9071 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9072 real grid_density;
9073 char sbuf[22];
9075 wallcycle_start(wcycle, ewcDOMDEC);
9077 dd = cr->dd;
9078 comm = dd->comm;
9080 bBoxChanged = (bMasterState || inputrecDeform(ir));
9081 if (ir->epc != epcNO)
9083 /* With nstpcouple > 1, pressure coupling happens
9084 * one step after the pressure is calculated.
9085 * Box scaling happens at the end of the MD step,
9086 * after the DD partitioning.
9087 * We therefore have to do DLB in the first partitioning
9088 * after an MD step where P-coupling occurred.
9089 * We need to determine the last step in which p-coupling occurred.
9090 * MRS -- need to validate this for vv?
9092 n = ir->nstpcouple;
9093 if (n == 1)
9095 step_pcoupl = step - 1;
9097 else
9099 step_pcoupl = ((step - 1)/n)*n + 1;
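/* Worked example (assumed values): with nstpcouple n = 5 and step = 17,
 * step_pcoupl = ((17 - 1)/5)*5 + 1 = 16, the most recent step at which
 * pressure coupling was applied.
 */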
9101 if (step_pcoupl >= comm->partition_step)
9103 bBoxChanged = TRUE;
9107 bNStGlobalComm = (step % nstglobalcomm == 0);
9109 if (!dlbIsOn(comm))
9111 bDoDLB = FALSE;
9113 else
9115 /* Should we do dynamic load balancing this step?
9116 * Since it requires (possibly expensive) global communication,
9117 * we might want to do DLB less frequently.
9119 if (bBoxChanged || ir->epc != epcNO)
9121 bDoDLB = bBoxChanged;
9123 else
9125 bDoDLB = bNStGlobalComm;
9129 /* Check if we have recorded loads on the nodes */
9130 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9132 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9134 /* Print load every nstlog, first and last step to the log file */
9135 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9136 comm->n_load_collect == 0 ||
9137 (ir->nsteps >= 0 &&
9138 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9140 /* Avoid extra communication due to verbose screen output
9141 * when nstglobalcomm is set.
9143 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9144 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9146 get_load_distribution(dd, wcycle);
9147 if (DDMASTER(dd))
9149 if (bLogLoad)
9151 dd_print_load(fplog, dd, step-1);
9153 if (bVerbose)
9155 dd_print_load_verbose(dd);
9158 comm->n_load_collect++;
9160 if (dlbIsOn(comm))
9162 if (DDMASTER(dd))
9164 /* Add the measured cycles to the running average */
9165 const float averageFactor = 0.1f;
9166 comm->cyclesPerStepDlbExpAverage =
9167 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9168 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
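/* This is an exponential moving average with weight 0.1. Assumed example:
 * with a previous average of 1000 cycles/step and a current block of
 * 1200 cycles/step the new average becomes 0.9*1000 + 0.1*1200 = 1020.
 */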
9170 if (comm->dlbState == edlbsOnCanTurnOff &&
9171 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9173 gmx_bool turnOffDlb;
9174 if (DDMASTER(dd))
9176 /* If the running average of the cycles with DLB is higher
9177 * than before we turned on DLB, turn off DLB.
9178 * We will again run and check the cycles without DLB
9179 * and we can then decide whether to turn off DLB forever.
9181 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9182 comm->cyclesPerStepBeforeDLB);
9184 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9185 if (turnOffDlb)
9187 /* To turn off DLB, we need to redistribute the atoms */
9188 dd_collect_state(dd, state_local, state_global);
9189 bMasterState = TRUE;
9190 turn_off_dlb(fplog, cr, step);
9194 else if (bCheckWhetherToTurnDlbOn)
9196 gmx_bool turnOffDlbForever = FALSE;
9197 gmx_bool turnOnDlb = FALSE;
9199 /* Since the timings are node dependent, the master decides */
9200 if (DDMASTER(dd))
9202 /* If we recently turned off DLB, we want to check if
9203 * performance is better without DLB. We want to do this
9204 * ASAP to minimize the chance that external factors that
9205 * slowed down the DLB steps have disappeared by now, which would
9206 * make us incorrectly conclude that DLB was causing the slowdown.
9207 * So we measure one nstlist block, no running average.
9209 if (comm->haveTurnedOffDlb &&
9210 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9211 comm->cyclesPerStepDlbExpAverage)
9213 /* After turning off DLB we ran nstlist steps in fewer
9214 * cycles than with DLB. This likely means that DLB
9215 * is not beneficial, but this could be due to a one-
9216 * time unlucky fluctuation, so we require two such
9217 * observations in close succession to turn off DLB
9218 * forever.
9220 if (comm->dlbSlowerPartitioningCount > 0 &&
9221 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9223 turnOffDlbForever = TRUE;
9225 comm->haveTurnedOffDlb = false;
9226 /* Register when we last measured DLB slowdown */
9227 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9229 else
9231 /* Here we check if the max PME rank load is more than 0.98
9232 * the max PP force load. If so, PP DLB will not help,
9233 * since we are (almost) limited by PME. Furthermore,
9234 * DLB will cause a significant extra x/f redistribution
9235 * cost on the PME ranks, which will then surely result
9236 * in lower total performance.
9238 if (cr->npmenodes > 0 &&
9239 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9241 turnOnDlb = FALSE;
9243 else
9245 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9249 struct
9251 gmx_bool turnOffDlbForever;
9252 gmx_bool turnOnDlb;
9254 bools {
9255 turnOffDlbForever, turnOnDlb
9257 dd_bcast(dd, sizeof(bools), &bools);
9258 if (bools.turnOffDlbForever)
9260 turn_off_dlb_forever(fplog, cr, step);
9262 else if (bools.turnOnDlb)
9264 turn_on_dlb(fplog, cr, step);
9265 bDoDLB = TRUE;
9269 comm->n_load_have++;
9272 cgs_gl = &comm->cgs_gl;
9274 bRedist = FALSE;
9275 if (bMasterState)
9277 /* Clear the old state */
9278 clear_dd_indices(dd, 0, 0);
9279 ncgindex_set = 0;
9281 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9282 TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
9284 get_cg_distribution(fplog, dd, cgs_gl,
9285 state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
9287 dd_distribute_state(dd, cgs_gl,
9288 state_global, state_local, f);
9290 dd_make_local_cgs(dd, &top_local->cgs);
9292 /* Ensure that we have space for the new distribution */
9293 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9295 if (fr->cutoff_scheme == ecutsGROUP)
9297 calc_cgcm(fplog, 0, dd->ncg_home,
9298 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9301 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9303 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9305 else if (state_local->ddp_count != dd->ddp_count)
9307 if (state_local->ddp_count > dd->ddp_count)
9309 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9312 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9314 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9317 /* Clear the old state */
9318 clear_dd_indices(dd, 0, 0);
9320 /* Build the new indices */
9321 rebuild_cgindex(dd, cgs_gl->index, state_local);
9322 make_dd_indices(dd, cgs_gl->index, 0);
9323 ncgindex_set = dd->ncg_home;
9325 if (fr->cutoff_scheme == ecutsGROUP)
9327 /* Redetermine the cg COMs */
9328 calc_cgcm(fplog, 0, dd->ncg_home,
9329 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9332 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9334 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9336 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9337 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9339 bRedist = dlbIsOn(comm);
9341 else
9343 /* We have the full state, only redistribute the cgs */
9345 /* Clear the non-home indices */
9346 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9347 ncgindex_set = 0;
9349 /* Avoid global communication for dim's without pbc and -gcom */
9350 if (!bNStGlobalComm)
9352 copy_rvec(comm->box0, ddbox.box0 );
9353 copy_rvec(comm->box_size, ddbox.box_size);
9355 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9356 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9358 bBoxChanged = TRUE;
9359 bRedist = TRUE;
9361 /* For dim's without pbc and -gcom */
9362 copy_rvec(ddbox.box0, comm->box0 );
9363 copy_rvec(ddbox.box_size, comm->box_size);
9365 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9366 step, wcycle);
9368 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9370 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9373 /* Check if we should sort the charge groups */
9374 bSortCG = (bMasterState || bRedist);
9376 ncg_home_old = dd->ncg_home;
9378 ncg_moved = 0;
9379 if (bRedist)
9381 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9383 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9384 state_local, f, fr,
9385 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9387 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns->grid->n, ncells_old);
            grid_first(fplog, fr->ns->grid, dd, &ddbox,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlist, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);
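    /* The grid cell counts recorded above (ncells_old) are compared with the
     * counts after regridding (ncells_new) inside the sorting block below:
     * if they differ, a full resort is forced; otherwise the previous sort
     * order of the charge groups can be reused, which makes the sort cheaper.
     */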
    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  as_rvec_array(state_local->x.data()),
                                  ncg_moved, bRedist ? comm->moved : nullptr,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns->grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        dd_sort_state(dd, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);

        /* After sorting and compacting we set the correct size */
        dd_resize_state(state_local, f, dd->nat_home);

        /* Rebuild all the indices */
        ga2la_clear(dd->ga2la);
        ncgindex_set = 0;

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }
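    /* The halo setup below determines, per decomposition dimension and pulse,
     * which home atoms have to be sent to the neighboring domains to cover the
     * non-bonded and bonded cut-offs, and communicates their coordinates; this
     * populates the non-home zones used during the rest of the step.
     */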
    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, ncgindex_set);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home", step, "dump", top_global, cr,
                    -1, as_rvec_array(state_local->x.data()), state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
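    /* np[] now holds the number of communication pulses along each
     * decomposition dimension; dd_make_local_top uses these together with
     * cellsize_min when deciding which bonded interactions can safely be
     * assigned to this rank.
     */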
    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
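    /* comm->nat[] stores cumulative local atom counts per communication stage:
     * the zone atoms first, then the atoms added for constructing virtual
     * sites, then those added for constraints.  n starts at the zone count and
     * grows as each stage appends the atoms it needs.
     */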
    /* Set up the special atom communication */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only inter-cg constraints need special code here */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];

    dd_resize_state(state_local, f, state_local->natoms);
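    /* nat_f_novirsum sets how many atoms the separate force buffer for
     * contributions excluded from the virial summation (such as the PME mesh
     * forces) must hold: up to the vsite-communicated atoms when there are
     * inter-cg vsites, all communicated atoms when full electrostatics with
     * inter-cg exclusions is used, and otherwise only the home atoms.
     */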
    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when doing energy
     * minimization. For simple simulations we could avoid some
     * allocation, zeroing and copying, but this is probably not worth
     * the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* Update atom data for mdatoms and several algorithms */
    mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
                              nullptr, mdatoms, vsite, nullptr);

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }
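    /* The atoms owned by this rank have changed, so a dedicated PME rank
     * (if present) needs the updated per-atom charges and, for LJ-PME,
     * c6/sigma parameters before the next mesh calculation.
     */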
    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME only node */
        gmx_pme_send_parameters(cr,
                                fr->ic,
                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->bPull)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    if (ir->eSwapCoords != eswapNO)
    {
        /* Update the local groups needed for ion swapping */
        dd_make_local_swap_groups(dd, ir->swap);
    }

    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);

    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, as_rvec_array(state_local->x.data()), state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }

    wallcycle_stop(wcycle, ewcDOMDEC);