src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <limits.h>
  44 #include <math.h>
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include <string.h>
  48
  49 #include <algorithm>
  50
  51 #include "gromacs/domdec/domdec_network.h"
  52 #include "gromacs/domdec/ga2la.h"
  53 #include "gromacs/ewald/pme.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/chargegroup.h"
  57 #include "gromacs/gmxlib/network.h"
  58 #include "gromacs/gmxlib/nrnb.h"
  59 #include "gromacs/gpu_utils/gpu_utils.h"
  60 #include "gromacs/hardware/hw_info.h"
  61 #include "gromacs/imd/imd.h"
  62 #include "gromacs/listed-forces/manage-threading.h"
  63 #include "gromacs/math/functions.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdlib/constr.h"
  67 #include "gromacs/mdlib/force.h"
  68 #include "gromacs/mdlib/forcerec.h"
  69 #include "gromacs/mdlib/genborn.h"
  70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  71 #include "gromacs/mdlib/mdatoms.h"
  72 #include "gromacs/mdlib/mdrun.h"
  73 #include "gromacs/mdlib/nb_verlet.h"
  74 #include "gromacs/mdlib/nbnxn_grid.h"
  75 #include "gromacs/mdlib/nsgrid.h"
  76 #include "gromacs/mdlib/shellfc.h"
  77 #include "gromacs/mdlib/vsite.h"
  78 #include "gromacs/mdtypes/commrec.h"
  79 #include "gromacs/mdtypes/df_history.h"
  80 #include "gromacs/mdtypes/forcerec.h"
  81 #include "gromacs/mdtypes/inputrec.h"
  82 #include "gromacs/mdtypes/md_enums.h"
  83 #include "gromacs/mdtypes/mdatom.h"
  84 #include "gromacs/mdtypes/nblist.h"
  85 #include "gromacs/mdtypes/state.h"
  86 #include "gromacs/pbcutil/ishift.h"
  87 #include "gromacs/pbcutil/pbc.h"
  88 #include "gromacs/pulling/pull.h"
  89 #include "gromacs/pulling/pull_rotation.h"
  90 #include "gromacs/swap/swapcoords.h"
  91 #include "gromacs/timing/wallcycle.h"
  92 #include "gromacs/topology/block.h"
  93 #include "gromacs/topology/idef.h"
  94 #include "gromacs/topology/ifunc.h"
  95 #include "gromacs/topology/mtop_util.h"
  96 #include "gromacs/topology/topology.h"
  97 #include "gromacs/utility/basedefinitions.h"
  98 #include "gromacs/utility/basenetwork.h"
  99 #include "gromacs/utility/cstringutil.h"
 100 #include "gromacs/utility/exceptions.h"
 101 #include "gromacs/utility/fatalerror.h"
 102 #include "gromacs/utility/gmxmpi.h"
 103 #include "gromacs/utility/qsort_threadsafe.h"
 104 #include "gromacs/utility/real.h"
 105 #include "gromacs/utility/smalloc.h"
 106
 107 #include "domdec_constraints.h"
 108 #include "domdec_internal.h"
 109 #include "domdec_vsite.h"
 110
 111 #define DDRANK(dd, rank)    (rank)
 112 #define DDMASTERRANK(dd)   (dd->masterrank)
 113
 114 struct gmx_domdec_master_t
 115 {
 116     /* The cell boundaries */
 117     real **cell_x;
 118     /* The global charge group division */
 119     int   *ncg;    /* Number of home charge groups for each node */
 120     int   *index;  /* Index of nnodes+1 into cg */
 121     int   *cg;     /* Global charge group index */
 122     int   *nat;    /* Number of home atoms for each node. */
 123     int   *ibuf;   /* Buffer for communication */
 124     rvec  *vbuf;   /* Buffer for state scattering and gathering */
 125 };
 126
 127 #define DD_NLOAD_MAX 9
 128
 129 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
 130
 131 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 132 #define DD_CGIBS 2
 133
 134 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 135 #define DD_FLAG_NRCG  65535
 136 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 137 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 138
 139 /* The DD zone order */
 140 static const ivec dd_zo[DD_MAXZONE] =
 141 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 142
 143 /* The 3D setup */
 144 #define dd_z3n  8
 145 #define dd_zp3n 4
 146 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 147
 148 /* The 2D setup */
 149 #define dd_z2n  4
 150 #define dd_zp2n 2
 151 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 152
 153 /* The 1D setup */
 154 #define dd_z1n  2
 155 #define dd_zp1n 1
 156 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 157
 158 /* The 0D setup */
 159 #define dd_z0n  1
 160 #define dd_zp0n 1
 161 static const ivec dd_zp0[dd_zp0n] = {{0, 0, 1}};
 162
 163 /* Factors used to avoid problems due to rounding issues */
 164 #define DD_CELL_MARGIN       1.0001
 165 #define DD_CELL_MARGIN2      1.00005
 166 /* Factor to account for pressure scaling during nstlist steps */
 167 #define DD_PRES_SCALE_MARGIN 1.02
 168
 169 /* Turn on DLB when the load imbalance causes this amount of total loss.
 170  * There is a bit of overhead with DLB and it's difficult to achieve
 171  * a load imbalance of less than 2% with DLB.
 172  */
 173 #define DD_PERF_LOSS_DLB_ON  0.02
 174
 175 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 176 #define DD_PERF_LOSS_WARN    0.05
 177
 178 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 179
 180 /* Use separate MPI send and receive commands
 181  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 182  * This saves memory (and some copying for small nnodes).
 183  * For high parallelization scatter and gather calls are used.
 184  */
 185 #define GMX_DD_NNODES_SENDRECV 4
 186
 187
 188 /*
 189    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 190
 191    static void index2xyz(ivec nc,int ind,ivec xyz)
 192    {
 193    xyz[XX] = ind % nc[XX];
 194    xyz[YY] = (ind / nc[XX]) % nc[YY];
 195    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 196    }
 197  */
 198
 199 /* This order is required to minimize the coordinate communication in PME
 200  * which uses decomposition in the x direction.
 201  */
 202 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 203
 204 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 205 {
 206     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 207     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 208     xyz[ZZ] = ind % nc[ZZ];
 209 }
 210
 211 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 212 {
 213     int ddindex;
 214     int ddnodeid = -1;
 215
 216     ddindex = dd_index(dd->nc, c);
 217     if (dd->comm->bCartesianPP_PME)
 218     {
 219         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 220     }
 221     else if (dd->comm->bCartesianPP)
 222     {
 223 #ifdef GMX_MPI
 224         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 225 #endif
 226     }
 227     else
 228     {
 229         ddnodeid = ddindex;
 230     }
 231
 232     return ddnodeid;
 233 }
 234
 235 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
 236 {
 237     return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
 238 }
 239
 240 int ddglatnr(gmx_domdec_t *dd, int i)
 241 {
 242     int atnr;
 243
 244     if (dd == NULL)
 245     {
 246         atnr = i + 1;
 247     }
 248     else
 249     {
 250         if (i >= dd->comm->nat[ddnatNR-1])
 251         {
 252             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 253         }
 254         atnr = dd->gatindex[i] + 1;
 255     }
 256
 257     return atnr;
 258 }
 259
 260 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 261 {
 262     return &dd->comm->cgs_gl;
 263 }
 264
 265 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
 266 {
 267     return (comm->dlbState == edlbsOn);
 268 }
 269
 270 static void vec_rvec_init(vec_rvec_t *v)
 271 {
 272     v->nalloc = 0;
 273     v->v      = NULL;
 274 }
 275
 276 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 277 {
 278     if (n > v->nalloc)
 279     {
 280         v->nalloc = over_alloc_dd(n);
 281         srenew(v->v, v->nalloc);
 282     }
 283 }
 284
 285 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 286 {
 287     int i;
 288
 289     if (state->ddp_count != dd->ddp_count)
 290     {
 291         gmx_incons("The state does not the domain decomposition state");
 292     }
 293
 294     state->ncg_gl = dd->ncg_home;
 295     if (state->ncg_gl > state->cg_gl_nalloc)
 296     {
 297         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 298         srenew(state->cg_gl, state->cg_gl_nalloc);
 299     }
 300     for (i = 0; i < state->ncg_gl; i++)
 301     {
 302         state->cg_gl[i] = dd->index_gl[i];
 303     }
 304
 305     state->ddp_count_cg_gl = dd->ddp_count;
 306 }
 307
 308 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 309 {
 310     return &dd->comm->zones;
 311 }
 312
 313 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 314                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 315 {
 316     gmx_domdec_zones_t *zones;
 317     int                 izone, d, dim;
 318
 319     zones = &dd->comm->zones;
 320
 321     izone = 0;
 322     while (icg >= zones->izone[izone].cg1)
 323     {
 324         izone++;
 325     }
 326
 327     if (izone == 0)
 328     {
 329         *jcg0 = icg;
 330     }
 331     else if (izone < zones->nizone)
 332     {
 333         *jcg0 = zones->izone[izone].jcg0;
 334     }
 335     else
 336     {
 337         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 338                   icg, izone, zones->nizone);
 339     }
 340
 341     *jcg1 = zones->izone[izone].jcg1;
 342
 343     for (d = 0; d < dd->ndim; d++)
 344     {
 345         dim         = dd->dim[d];
 346         shift0[dim] = zones->izone[izone].shift0[dim];
 347         shift1[dim] = zones->izone[izone].shift1[dim];
 348         if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
 349         {
 350             /* A conservative approach, this can be optimized */
 351             shift0[dim] -= 1;
 352             shift1[dim] += 1;
 353         }
 354     }
 355 }
 356
 357 int dd_natoms_vsite(gmx_domdec_t *dd)
 358 {
 359     return dd->comm->nat[ddnatVSITE];
 360 }
 361
 362 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 363 {
 364     *at_start = dd->comm->nat[ddnatCON-1];
 365     *at_end   = dd->comm->nat[ddnatCON];
 366 }
 367
 368 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 369 {
 370     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 371     int                   *index, *cgindex;
 372     gmx_domdec_comm_t     *comm;
 373     gmx_domdec_comm_dim_t *cd;
 374     gmx_domdec_ind_t      *ind;
 375     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 376     gmx_bool               bPBC, bScrew;
 377
 378     comm = dd->comm;
 379
 380     cgindex = dd->cgindex;
 381
 382     buf = comm->vbuf.v;
 383
 384     nzone   = 1;
 385     nat_tot = dd->nat_home;
 386     for (d = 0; d < dd->ndim; d++)
 387     {
 388         bPBC   = (dd->ci[dd->dim[d]] == 0);
 389         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 390         if (bPBC)
 391         {
 392             copy_rvec(box[dd->dim[d]], shift);
 393         }
 394         cd = &comm->cd[d];
 395         for (p = 0; p < cd->np; p++)
 396         {
 397             ind   = &cd->ind[p];
 398             index = ind->index;
 399             n     = 0;
 400             if (!bPBC)
 401             {
 402                 for (i = 0; i < ind->nsend[nzone]; i++)
 403                 {
 404                     at0 = cgindex[index[i]];
 405                     at1 = cgindex[index[i]+1];
 406                     for (j = at0; j < at1; j++)
 407                     {
 408                         copy_rvec(x[j], buf[n]);
 409                         n++;
 410                     }
 411                 }
 412             }
 413             else if (!bScrew)
 414             {
 415                 for (i = 0; i < ind->nsend[nzone]; i++)
 416                 {
 417                     at0 = cgindex[index[i]];
 418                     at1 = cgindex[index[i]+1];
 419                     for (j = at0; j < at1; j++)
 420                     {
 421                         /* We need to shift the coordinates */
 422                         rvec_add(x[j], shift, buf[n]);
 423                         n++;
 424                     }
 425                 }
 426             }
 427             else
 428             {
 429                 for (i = 0; i < ind->nsend[nzone]; i++)
 430                 {
 431                     at0 = cgindex[index[i]];
 432                     at1 = cgindex[index[i]+1];
 433                     for (j = at0; j < at1; j++)
 434                     {
 435                         /* Shift x */
 436                         buf[n][XX] = x[j][XX] + shift[XX];
 437                         /* Rotate y and z.
 438                          * This operation requires a special shift force
 439                          * treatment, which is performed in calc_vir.
 440                          */
 441                         buf[n][YY] = box[YY][YY] - x[j][YY];
 442                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 443                         n++;
 444                     }
 445                 }
 446             }
 447
 448             if (cd->bInPlace)
 449             {
 450                 rbuf = x + nat_tot;
 451             }
 452             else
 453             {
 454                 rbuf = comm->vbuf2.v;
 455             }
 456             /* Send and receive the coordinates */
 457             dd_sendrecv_rvec(dd, d, dddirBackward,
 458                              buf,  ind->nsend[nzone+1],
 459                              rbuf, ind->nrecv[nzone+1]);
 460             if (!cd->bInPlace)
 461             {
 462                 j = 0;
 463                 for (zone = 0; zone < nzone; zone++)
 464                 {
 465                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 466                     {
 467                         copy_rvec(rbuf[j], x[i]);
 468                         j++;
 469                     }
 470                 }
 471             }
 472             nat_tot += ind->nrecv[nzone+1];
 473         }
 474         nzone += nzone;
 475     }
 476 }
 477
 478 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 479 {
 480     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 481     int                   *index, *cgindex;
 482     gmx_domdec_comm_t     *comm;
 483     gmx_domdec_comm_dim_t *cd;
 484     gmx_domdec_ind_t      *ind;
 485     rvec                  *buf, *sbuf;
 486     ivec                   vis;
 487     int                    is;
 488     gmx_bool               bShiftForcesNeedPbc, bScrew;
 489
 490     comm = dd->comm;
 491
 492     cgindex = dd->cgindex;
 493
 494     buf = comm->vbuf.v;
 495
 496     nzone   = comm->zones.n/2;
 497     nat_tot = dd->nat_tot;
 498     for (d = dd->ndim-1; d >= 0; d--)
 499     {
 500         /* Only forces in domains near the PBC boundaries need to
 501            consider PBC in the treatment of fshift */
 502         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 503         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 504         if (fshift == NULL && !bScrew)
 505         {
 506             bShiftForcesNeedPbc = FALSE;
 507         }
 508         /* Determine which shift vector we need */
 509         clear_ivec(vis);
 510         vis[dd->dim[d]] = 1;
 511         is              = IVEC2IS(vis);
 512
 513         cd = &comm->cd[d];
 514         for (p = cd->np-1; p >= 0; p--)
 515         {
 516             ind      = &cd->ind[p];
 517             nat_tot -= ind->nrecv[nzone+1];
 518             if (cd->bInPlace)
 519             {
 520                 sbuf = f + nat_tot;
 521             }
 522             else
 523             {
 524                 sbuf = comm->vbuf2.v;
 525                 j    = 0;
 526                 for (zone = 0; zone < nzone; zone++)
 527                 {
 528                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 529                     {
 530                         copy_rvec(f[i], sbuf[j]);
 531                         j++;
 532                     }
 533                 }
 534             }
 535             /* Communicate the forces */
 536             dd_sendrecv_rvec(dd, d, dddirForward,
 537                              sbuf, ind->nrecv[nzone+1],
 538                              buf,  ind->nsend[nzone+1]);
 539             index = ind->index;
 540             /* Add the received forces */
 541             n = 0;
 542             if (!bShiftForcesNeedPbc)
 543             {
 544                 for (i = 0; i < ind->nsend[nzone]; i++)
 545                 {
 546                     at0 = cgindex[index[i]];
 547                     at1 = cgindex[index[i]+1];
 548                     for (j = at0; j < at1; j++)
 549                     {
 550                         rvec_inc(f[j], buf[n]);
 551                         n++;
 552                     }
 553                 }
 554             }
 555             else if (!bScrew)
 556             {
 557                 /* fshift should always be defined if this function is
 558                  * called when bShiftForcesNeedPbc is true */
 559                 assert(NULL != fshift);
 560                 for (i = 0; i < ind->nsend[nzone]; i++)
 561                 {
 562                     at0 = cgindex[index[i]];
 563                     at1 = cgindex[index[i]+1];
 564                     for (j = at0; j < at1; j++)
 565                     {
 566                         rvec_inc(f[j], buf[n]);
 567                         /* Add this force to the shift force */
 568                         rvec_inc(fshift[is], buf[n]);
 569                         n++;
 570                     }
 571                 }
 572             }
 573             else
 574             {
 575                 for (i = 0; i < ind->nsend[nzone]; i++)
 576                 {
 577                     at0 = cgindex[index[i]];
 578                     at1 = cgindex[index[i]+1];
 579                     for (j = at0; j < at1; j++)
 580                     {
 581                         /* Rotate the force */
 582                         f[j][XX] += buf[n][XX];
 583                         f[j][YY] -= buf[n][YY];
 584                         f[j][ZZ] -= buf[n][ZZ];
 585                         if (fshift)
 586                         {
 587                             /* Add this force to the shift force */
 588                             rvec_inc(fshift[is], buf[n]);
 589                         }
 590                         n++;
 591                     }
 592                 }
 593             }
 594         }
 595         nzone /= 2;
 596     }
 597 }
 598
 599 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 600 {
 601     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 602     int                   *index, *cgindex;
 603     gmx_domdec_comm_t     *comm;
 604     gmx_domdec_comm_dim_t *cd;
 605     gmx_domdec_ind_t      *ind;
 606     real                  *buf, *rbuf;
 607
 608     comm = dd->comm;
 609
 610     cgindex = dd->cgindex;
 611
 612     buf = &comm->vbuf.v[0][0];
 613
 614     nzone   = 1;
 615     nat_tot = dd->nat_home;
 616     for (d = 0; d < dd->ndim; d++)
 617     {
 618         cd = &comm->cd[d];
 619         for (p = 0; p < cd->np; p++)
 620         {
 621             ind   = &cd->ind[p];
 622             index = ind->index;
 623             n     = 0;
 624             for (i = 0; i < ind->nsend[nzone]; i++)
 625             {
 626                 at0 = cgindex[index[i]];
 627                 at1 = cgindex[index[i]+1];
 628                 for (j = at0; j < at1; j++)
 629                 {
 630                     buf[n] = v[j];
 631                     n++;
 632                 }
 633             }
 634
 635             if (cd->bInPlace)
 636             {
 637                 rbuf = v + nat_tot;
 638             }
 639             else
 640             {
 641                 rbuf = &comm->vbuf2.v[0][0];
 642             }
 643             /* Send and receive the coordinates */
 644             dd_sendrecv_real(dd, d, dddirBackward,
 645                              buf,  ind->nsend[nzone+1],
 646                              rbuf, ind->nrecv[nzone+1]);
 647             if (!cd->bInPlace)
 648             {
 649                 j = 0;
 650                 for (zone = 0; zone < nzone; zone++)
 651                 {
 652                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 653                     {
 654                         v[i] = rbuf[j];
 655                         j++;
 656                     }
 657                 }
 658             }
 659             nat_tot += ind->nrecv[nzone+1];
 660         }
 661         nzone += nzone;
 662     }
 663 }
 664
 665 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 666 {
 667     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 668     int                   *index, *cgindex;
 669     gmx_domdec_comm_t     *comm;
 670     gmx_domdec_comm_dim_t *cd;
 671     gmx_domdec_ind_t      *ind;
 672     real                  *buf, *sbuf;
 673
 674     comm = dd->comm;
 675
 676     cgindex = dd->cgindex;
 677
 678     buf = &comm->vbuf.v[0][0];
 679
 680     nzone   = comm->zones.n/2;
 681     nat_tot = dd->nat_tot;
 682     for (d = dd->ndim-1; d >= 0; d--)
 683     {
 684         cd = &comm->cd[d];
 685         for (p = cd->np-1; p >= 0; p--)
 686         {
 687             ind      = &cd->ind[p];
 688             nat_tot -= ind->nrecv[nzone+1];
 689             if (cd->bInPlace)
 690             {
 691                 sbuf = v + nat_tot;
 692             }
 693             else
 694             {
 695                 sbuf = &comm->vbuf2.v[0][0];
 696                 j    = 0;
 697                 for (zone = 0; zone < nzone; zone++)
 698                 {
 699                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 700                     {
 701                         sbuf[j] = v[i];
 702                         j++;
 703                     }
 704                 }
 705             }
 706             /* Communicate the forces */
 707             dd_sendrecv_real(dd, d, dddirForward,
 708                              sbuf, ind->nrecv[nzone+1],
 709                              buf,  ind->nsend[nzone+1]);
 710             index = ind->index;
 711             /* Add the received forces */
 712             n = 0;
 713             for (i = 0; i < ind->nsend[nzone]; i++)
 714             {
 715                 at0 = cgindex[index[i]];
 716                 at1 = cgindex[index[i]+1];
 717                 for (j = at0; j < at1; j++)
 718                 {
 719                     v[j] += buf[n];
 720                     n++;
 721                 }
 722             }
 723         }
 724         nzone /= 2;
 725     }
 726 }
 727
 728 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 729 {
 730     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 731             d, i, j,
 732             zone->min0, zone->max1,
 733             zone->mch0, zone->mch0,
 734             zone->p1_0, zone->p1_1);
 735 }
 736
 737
 738 #define DDZONECOMM_MAXZONE  5
 739 #define DDZONECOMM_BUFSIZE  3
 740
 741 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 742                                int ddimind, int direction,
 743                                gmx_ddzone_t *buf_s, int n_s,
 744                                gmx_ddzone_t *buf_r, int n_r)
 745 {
 746 #define ZBS  DDZONECOMM_BUFSIZE
 747     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 748     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 749     int  i;
 750
 751     for (i = 0; i < n_s; i++)
 752     {
 753         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 754         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 755         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 756         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 757         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 758         vbuf_s[i*ZBS+1][2] = 0;
 759         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 760         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 761         vbuf_s[i*ZBS+2][2] = 0;
 762     }
 763
 764     dd_sendrecv_rvec(dd, ddimind, direction,
 765                      vbuf_s, n_s*ZBS,
 766                      vbuf_r, n_r*ZBS);
 767
 768     for (i = 0; i < n_r; i++)
 769     {
 770         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 771         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 772         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 773         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 774         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 775         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 776         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 777     }
 778
 779 #undef ZBS
 780 }
 781
 782 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 783                           rvec cell_ns_x0, rvec cell_ns_x1)
 784 {
 785     int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
 786     gmx_ddzone_t      *zp;
 787     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 788     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 789     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 790     rvec               extr_s[2], extr_r[2];
 791     rvec               dh;
 792     real               dist_d, c = 0, det;
 793     gmx_domdec_comm_t *comm;
 794     gmx_bool           bPBC, bUse;
 795
 796     comm = dd->comm;
 797
 798     for (d = 1; d < dd->ndim; d++)
 799     {
 800         dim      = dd->dim[d];
 801         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 802         zp->min0 = cell_ns_x0[dim];
 803         zp->max1 = cell_ns_x1[dim];
 804         zp->min1 = cell_ns_x1[dim];
 805         zp->mch0 = cell_ns_x0[dim];
 806         zp->mch1 = cell_ns_x1[dim];
 807         zp->p1_0 = cell_ns_x0[dim];
 808         zp->p1_1 = cell_ns_x1[dim];
 809     }
 810
 811     for (d = dd->ndim-2; d >= 0; d--)
 812     {
 813         dim  = dd->dim[d];
 814         bPBC = (dim < ddbox->npbcdim);
 815
 816         /* Use an rvec to store two reals */
 817         extr_s[d][0] = comm->cell_f0[d+1];
 818         extr_s[d][1] = comm->cell_f1[d+1];
 819         extr_s[d][2] = comm->cell_f1[d+1];
 820
 821         pos = 0;
 822         /* Store the extremes in the backward sending buffer,
 823          * so the get updated separately from the forward communication.
 824          */
 825         for (d1 = d; d1 < dd->ndim-1; d1++)
 826         {
 827             /* We invert the order to be able to use the same loop for buf_e */
 828             buf_s[pos].min0 = extr_s[d1][1];
 829             buf_s[pos].max1 = extr_s[d1][0];
 830             buf_s[pos].min1 = extr_s[d1][2];
 831             buf_s[pos].mch0 = 0;
 832             buf_s[pos].mch1 = 0;
 833             /* Store the cell corner of the dimension we communicate along */
 834             buf_s[pos].p1_0 = comm->cell_x0[dim];
 835             buf_s[pos].p1_1 = 0;
 836             pos++;
 837         }
 838
 839         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 840         pos++;
 841
 842         if (dd->ndim == 3 && d == 0)
 843         {
 844             buf_s[pos] = comm->zone_d2[0][1];
 845             pos++;
 846             buf_s[pos] = comm->zone_d1[0];
 847             pos++;
 848         }
 849
 850         /* We only need to communicate the extremes
 851          * in the forward direction
 852          */
 853         npulse = comm->cd[d].np;
 854         if (bPBC)
 855         {
 856             /* Take the minimum to avoid double communication */
 857             npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
 858         }
 859         else
 860         {
 861             /* Without PBC we should really not communicate over
 862              * the boundaries, but implementing that complicates
 863              * the communication setup and therefore we simply
 864              * do all communication, but ignore some data.
 865              */
 866             npulse_min = npulse;
 867         }
 868         for (p = 0; p < npulse_min; p++)
 869         {
 870             /* Communicate the extremes forward */
 871             bUse = (bPBC || dd->ci[dim] > 0);
 872
 873             dd_sendrecv_rvec(dd, d, dddirForward,
 874                              extr_s+d, dd->ndim-d-1,
 875                              extr_r+d, dd->ndim-d-1);
 876
 877             if (bUse)
 878             {
 879                 for (d1 = d; d1 < dd->ndim-1; d1++)
 880                 {
 881                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 882                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 883                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 884                 }
 885             }
 886         }
 887
 888         buf_size = pos;
 889         for (p = 0; p < npulse; p++)
 890         {
 891             /* Communicate all the zone information backward */
 892             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 893
 894             dd_sendrecv_ddzone(dd, d, dddirBackward,
 895                                buf_s, buf_size,
 896                                buf_r, buf_size);
 897
 898             clear_rvec(dh);
 899             if (p > 0)
 900             {
 901                 for (d1 = d+1; d1 < dd->ndim; d1++)
 902                 {
 903                     /* Determine the decrease of maximum required
 904                      * communication height along d1 due to the distance along d,
 905                      * this avoids a lot of useless atom communication.
 906                      */
 907                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 908
 909                     if (ddbox->tric_dir[dim])
 910                     {
 911                         /* c is the off-diagonal coupling between the cell planes
 912                          * along directions d and d1.
 913                          */
 914                         c = ddbox->v[dim][dd->dim[d1]][dim];
 915                     }
 916                     else
 917                     {
 918                         c = 0;
 919                     }
 920                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 921                     if (det > 0)
 922                     {
 923                         dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
 924                     }
 925                     else
 926                     {
 927                         /* A negative value signals out of range */
 928                         dh[d1] = -1;
 929                     }
 930                 }
 931             }
 932
 933             /* Accumulate the extremes over all pulses */
 934             for (i = 0; i < buf_size; i++)
 935             {
 936                 if (p == 0)
 937                 {
 938                     buf_e[i] = buf_r[i];
 939                 }
 940                 else
 941                 {
 942                     if (bUse)
 943                     {
 944                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 945                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 946                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 947                     }
 948
 949                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 950                     {
 951                         d1 = 1;
 952                     }
 953                     else
 954                     {
 955                         d1 = d + 1;
 956                     }
 957                     if (bUse && dh[d1] >= 0)
 958                     {
 959                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 960                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 961                     }
 962                 }
 963                 /* Copy the received buffer to the send buffer,
 964                  * to pass the data through with the next pulse.
 965                  */
 966                 buf_s[i] = buf_r[i];
 967             }
 968             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 969                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 970             {
 971                 /* Store the extremes */
 972                 pos = 0;
 973
 974                 for (d1 = d; d1 < dd->ndim-1; d1++)
 975                 {
 976                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 977                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 978                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 979                     pos++;
 980                 }
 981
 982                 if (d == 1 || (d == 0 && dd->ndim == 3))
 983                 {
 984                     for (i = d; i < 2; i++)
 985                     {
 986                         comm->zone_d2[1-d][i] = buf_e[pos];
 987                         pos++;
 988                     }
 989                 }
 990                 if (d == 0)
 991                 {
 992                     comm->zone_d1[1] = buf_e[pos];
 993                     pos++;
 994                 }
 995             }
 996         }
 997     }
 998
 999     if (dd->ndim >= 2)
1000     {
1001         dim = dd->dim[1];
1002         for (i = 0; i < 2; i++)
1003         {
1004             if (debug)
1005             {
1006                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1007             }
1008             cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1009             cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1010         }
1011     }
1012     if (dd->ndim >= 3)
1013     {
1014         dim = dd->dim[2];
1015         for (i = 0; i < 2; i++)
1016         {
1017             for (j = 0; j < 2; j++)
1018             {
1019                 if (debug)
1020                 {
1021                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1022                 }
1023                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1024                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1025             }
1026         }
1027     }
1028     for (d = 1; d < dd->ndim; d++)
1029     {
1030         comm->cell_f_max0[d] = extr_s[d-1][0];
1031         comm->cell_f_min1[d] = extr_s[d-1][1];
1032         if (debug)
1033         {
1034             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1035                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1036         }
1037     }
1038 }
1039
1040 static void dd_collect_cg(gmx_domdec_t *dd,
1041                           t_state      *state_local)
1042 {
1043     gmx_domdec_master_t *ma = NULL;
1044     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1045
1046     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1047     {
1048         /* The master has the correct distribution */
1049         return;
1050     }
1051
1052     if (state_local->ddp_count == dd->ddp_count)
1053     {
1054         /* The local state and DD are in sync, use the DD indices */
1055         ncg_home = dd->ncg_home;
1056         cg       = dd->index_gl;
1057         nat_home = dd->nat_home;
1058     }
1059     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1060     {
1061         /* The DD is out of sync with the local state, but we have stored
1062          * the cg indices with the local state, so we can use those.
1063          */
1064         t_block *cgs_gl;
1065
1066         cgs_gl = &dd->comm->cgs_gl;
1067
1068         ncg_home = state_local->ncg_gl;
1069         cg       = state_local->cg_gl;
1070         nat_home = 0;
1071         for (i = 0; i < ncg_home; i++)
1072         {
1073             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1074         }
1075     }
1076     else
1077     {
1078         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1079     }
1080
1081     buf2[0] = ncg_home;
1082     buf2[1] = nat_home;
1083     if (DDMASTER(dd))
1084     {
1085         ma   = dd->ma;
1086         ibuf = ma->ibuf;
1087     }
1088     else
1089     {
1090         ibuf = NULL;
1091     }
1092     /* Collect the charge group and atom counts on the master */
1093     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1094
1095     if (DDMASTER(dd))
1096     {
1097         ma->index[0] = 0;
1098         for (i = 0; i < dd->nnodes; i++)
1099         {
1100             ma->ncg[i]     = ma->ibuf[2*i];
1101             ma->nat[i]     = ma->ibuf[2*i+1];
1102             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1103
1104         }
1105         /* Make byte counts and indices */
1106         for (i = 0; i < dd->nnodes; i++)
1107         {
1108             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1109             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1110         }
1111         if (debug)
1112         {
1113             fprintf(debug, "Initial charge group distribution: ");
1114             for (i = 0; i < dd->nnodes; i++)
1115             {
1116                 fprintf(debug, " %d", ma->ncg[i]);
1117             }
1118             fprintf(debug, "\n");
1119         }
1120     }
1121
1122     /* Collect the charge group indices on the master */
1123     dd_gatherv(dd,
1124                ncg_home*sizeof(int), cg,
1125                DDMASTER(dd) ? ma->ibuf : NULL,
1126                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1127                DDMASTER(dd) ? ma->cg : NULL);
1128
1129     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1130 }
1131
1132 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1133                                     rvec *lv, rvec *v)
1134 {
1135     gmx_domdec_master_t *ma;
1136     int                  n, i, c, a, nalloc = 0;
1137     rvec                *buf = NULL;
1138     t_block             *cgs_gl;
1139
1140     ma = dd->ma;
1141
1142     if (!DDMASTER(dd))
1143     {
1144 #ifdef GMX_MPI
1145         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1146                  dd->rank, dd->mpi_comm_all);
1147 #endif
1148     }
1149     else
1150     {
1151         /* Copy the master coordinates to the global array */
1152         cgs_gl = &dd->comm->cgs_gl;
1153
1154         n = DDMASTERRANK(dd);
1155         a = 0;
1156         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1157         {
1158             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1159             {
1160                 copy_rvec(lv[a++], v[c]);
1161             }
1162         }
1163
1164         for (n = 0; n < dd->nnodes; n++)
1165         {
1166             if (n != dd->rank)
1167             {
1168                 if (ma->nat[n] > nalloc)
1169                 {
1170                     nalloc = over_alloc_dd(ma->nat[n]);
1171                     srenew(buf, nalloc);
1172                 }
1173 #ifdef GMX_MPI
1174                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1175                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1176 #endif
1177                 a = 0;
1178                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1179                 {
1180                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1181                     {
1182                         copy_rvec(buf[a++], v[c]);
1183                     }
1184                 }
1185             }
1186         }
1187         sfree(buf);
1188     }
1189 }
1190
1191 static void get_commbuffer_counts(gmx_domdec_t *dd,
1192                                   int **counts, int **disps)
1193 {
1194     gmx_domdec_master_t *ma;
1195     int                  n;
1196
1197     ma = dd->ma;
1198
1199     /* Make the rvec count and displacment arrays */
1200     *counts  = ma->ibuf;
1201     *disps   = ma->ibuf + dd->nnodes;
1202     for (n = 0; n < dd->nnodes; n++)
1203     {
1204         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1205         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1206     }
1207 }
1208
1209 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1210                                    rvec *lv, rvec *v)
1211 {
1212     gmx_domdec_master_t *ma;
1213     int                 *rcounts = NULL, *disps = NULL;
1214     int                  n, i, c, a;
1215     rvec                *buf = NULL;
1216     t_block             *cgs_gl;
1217
1218     ma = dd->ma;
1219
1220     if (DDMASTER(dd))
1221     {
1222         get_commbuffer_counts(dd, &rcounts, &disps);
1223
1224         buf = ma->vbuf;
1225     }
1226
1227     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1228
1229     if (DDMASTER(dd))
1230     {
1231         cgs_gl = &dd->comm->cgs_gl;
1232
1233         a = 0;
1234         for (n = 0; n < dd->nnodes; n++)
1235         {
1236             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1237             {
1238                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1239                 {
1240                     copy_rvec(buf[a++], v[c]);
1241                 }
1242             }
1243         }
1244     }
1245 }
1246
1247 void dd_collect_vec(gmx_domdec_t *dd,
1248                     t_state *state_local, rvec *lv, rvec *v)
1249 {
1250     dd_collect_cg(dd, state_local);
1251
1252     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1253     {
1254         dd_collect_vec_sendrecv(dd, lv, v);
1255     }
1256     else
1257     {
1258         dd_collect_vec_gatherv(dd, lv, v);
1259     }
1260 }
1261
1262
1263 void dd_collect_state(gmx_domdec_t *dd,
1264                       t_state *state_local, t_state *state)
1265 {
1266     int est, i, j, nh;
1267
1268     nh = state->nhchainlength;
1269
1270     if (DDMASTER(dd))
1271     {
1272         for (i = 0; i < efptNR; i++)
1273         {
1274             state->lambda[i] = state_local->lambda[i];
1275         }
1276         state->fep_state = state_local->fep_state;
1277         state->veta      = state_local->veta;
1278         state->vol0      = state_local->vol0;
1279         copy_mat(state_local->box, state->box);
1280         copy_mat(state_local->boxv, state->boxv);
1281         copy_mat(state_local->svir_prev, state->svir_prev);
1282         copy_mat(state_local->fvir_prev, state->fvir_prev);
1283         copy_mat(state_local->pres_prev, state->pres_prev);
1284
1285         for (i = 0; i < state_local->ngtc; i++)
1286         {
1287             for (j = 0; j < nh; j++)
1288             {
1289                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1290                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1291             }
1292             state->therm_integral[i] = state_local->therm_integral[i];
1293         }
1294         for (i = 0; i < state_local->nnhpres; i++)
1295         {
1296             for (j = 0; j < nh; j++)
1297             {
1298                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1299                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1300             }
1301         }
1302     }
1303     for (est = 0; est < estNR; est++)
1304     {
1305         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1306         {
1307             switch (est)
1308             {
1309                 case estX:
1310                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1311                     break;
1312                 case estV:
1313                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1314                     break;
1315                 case estSDX:
1316                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1317                     break;
1318                 case estCGP:
1319                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1320                     break;
1321                 case estDISRE_INITF:
1322                 case estDISRE_RM3TAV:
1323                 case estORIRE_INITF:
1324                 case estORIRE_DTAV:
1325                     break;
1326                 default:
1327                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1328             }
1329         }
1330     }
1331 }
1332
1333 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1334 {
1335     int est;
1336
1337     if (debug)
1338     {
1339         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1340     }
1341
1342     state->nalloc = over_alloc_dd(nalloc);
1343
1344     for (est = 0; est < estNR; est++)
1345     {
1346         if (EST_DISTR(est) && (state->flags & (1<<est)))
1347         {
1348             /* We need to allocate one element extra, since we might use
1349              * (unaligned) 4-wide SIMD loads to access rvec entries.
1350              */
1351             switch (est)
1352             {
1353                 case estX:
1354                     srenew(state->x, state->nalloc + 1);
1355                     break;
1356                 case estV:
1357                     srenew(state->v, state->nalloc + 1);
1358                     break;
1359                 case estSDX:
1360                     srenew(state->sd_X, state->nalloc + 1);
1361                     break;
1362                 case estCGP:
1363                     srenew(state->cg_p, state->nalloc + 1);
1364                     break;
1365                 case estDISRE_INITF:
1366                 case estDISRE_RM3TAV:
1367                 case estORIRE_INITF:
1368                 case estORIRE_DTAV:
1369                     /* No reallocation required */
1370                     break;
1371                 default:
1372                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1373             }
1374         }
1375     }
1376
1377     if (f != NULL)
1378     {
1379         srenew(*f, state->nalloc);
1380     }
1381 }
1382
1383 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1384                                int nalloc)
1385 {
1386     if (nalloc > fr->cg_nalloc)
1387     {
1388         if (debug)
1389         {
1390             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1391         }
1392         fr->cg_nalloc = over_alloc_dd(nalloc);
1393         srenew(fr->cginfo, fr->cg_nalloc);
1394         if (fr->cutoff_scheme == ecutsGROUP)
1395         {
1396             srenew(fr->cg_cm, fr->cg_nalloc);
1397         }
1398     }
1399     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1400     {
1401         /* We don't use charge groups, we use x in state to set up
1402          * the atom communication.
1403          */
1404         dd_realloc_state(state, f, nalloc);
1405     }
1406 }
1407
1408 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1409                                        rvec *v, rvec *lv)
1410 {
1411     gmx_domdec_master_t *ma;
1412     int                  n, i, c, a, nalloc = 0;
1413     rvec                *buf = NULL;
1414
1415     if (DDMASTER(dd))
1416     {
1417         ma  = dd->ma;
1418
1419         for (n = 0; n < dd->nnodes; n++)
1420         {
1421             if (n != dd->rank)
1422             {
1423                 if (ma->nat[n] > nalloc)
1424                 {
1425                     nalloc = over_alloc_dd(ma->nat[n]);
1426                     srenew(buf, nalloc);
1427                 }
1428                 /* Use lv as a temporary buffer */
1429                 a = 0;
1430                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1431                 {
1432                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1433                     {
1434                         copy_rvec(v[c], buf[a++]);
1435                     }
1436                 }
1437                 if (a != ma->nat[n])
1438                 {
1439                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1440                               a, ma->nat[n]);
1441                 }
1442
1443 #ifdef GMX_MPI
1444                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1445                          DDRANK(dd, n), n, dd->mpi_comm_all);
1446 #endif
1447             }
1448         }
1449         sfree(buf);
1450         n = DDMASTERRANK(dd);
1451         a = 0;
1452         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1453         {
1454             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1455             {
1456                 copy_rvec(v[c], lv[a++]);
1457             }
1458         }
1459     }
1460     else
1461     {
1462 #ifdef GMX_MPI
1463         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1464                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1465 #endif
1466     }
1467 }
1468
1469 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1470                                        rvec *v, rvec *lv)
1471 {
1472     gmx_domdec_master_t *ma;
1473     int                 *scounts = NULL, *disps = NULL;
1474     int                  n, i, c, a;
1475     rvec                *buf = NULL;
1476
1477     if (DDMASTER(dd))
1478     {
1479         ma  = dd->ma;
1480
1481         get_commbuffer_counts(dd, &scounts, &disps);
1482
1483         buf = ma->vbuf;
1484         a   = 0;
1485         for (n = 0; n < dd->nnodes; n++)
1486         {
1487             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1488             {
1489                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1490                 {
1491                     copy_rvec(v[c], buf[a++]);
1492                 }
1493             }
1494         }
1495     }
1496
1497     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1498 }
1499
1500 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1501 {
1502     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1503     {
1504         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1505     }
1506     else
1507     {
1508         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1509     }
1510 }
1511
1512 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1513 {
1514     int i;
1515     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1516     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1517     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1518
1519     if (dfhist->nlambda > 0)
1520     {
1521         int nlam = dfhist->nlambda;
1522         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1523         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1524         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1525         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1526         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1527         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1528
1529         for (i = 0; i < nlam; i++)
1530         {
1531             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1532             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1533             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1534             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1535             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1536             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1537         }
1538     }
1539 }
1540
1541 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1542                                 t_state *state, t_state *state_local,
1543                                 rvec **f)
1544 {
1545     int  i, j, nh;
1546
1547     nh = state->nhchainlength;
1548
1549     if (DDMASTER(dd))
1550     {
1551         for (i = 0; i < efptNR; i++)
1552         {
1553             state_local->lambda[i] = state->lambda[i];
1554         }
1555         state_local->fep_state = state->fep_state;
1556         state_local->veta      = state->veta;
1557         state_local->vol0      = state->vol0;
1558         copy_mat(state->box, state_local->box);
1559         copy_mat(state->box_rel, state_local->box_rel);
1560         copy_mat(state->boxv, state_local->boxv);
1561         copy_mat(state->svir_prev, state_local->svir_prev);
1562         copy_mat(state->fvir_prev, state_local->fvir_prev);
1563         copy_df_history(&state_local->dfhist, &state->dfhist);
1564         for (i = 0; i < state_local->ngtc; i++)
1565         {
1566             for (j = 0; j < nh; j++)
1567             {
1568                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1569                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1570             }
1571             state_local->therm_integral[i] = state->therm_integral[i];
1572         }
1573         for (i = 0; i < state_local->nnhpres; i++)
1574         {
1575             for (j = 0; j < nh; j++)
1576             {
1577                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1578                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1579             }
1580         }
1581     }
1582     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1583     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1584     dd_bcast(dd, sizeof(real), &state_local->veta);
1585     dd_bcast(dd, sizeof(real), &state_local->vol0);
1586     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1587     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1588     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1589     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1590     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1591     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1592     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1593     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1594     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1595     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1596
1597     /* communicate df_history -- required for restarting from checkpoint */
1598     dd_distribute_dfhist(dd, &state_local->dfhist);
1599
1600     if (dd->nat_home > state_local->nalloc)
1601     {
1602         dd_realloc_state(state_local, f, dd->nat_home);
1603     }
1604     for (i = 0; i < estNR; i++)
1605     {
1606         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1607         {
1608             switch (i)
1609             {
1610                 case estX:
1611                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1612                     break;
1613                 case estV:
1614                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1615                     break;
1616                 case estSDX:
1617                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1618                     break;
1619                 case estCGP:
1620                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1621                     break;
1622                 case estDISRE_INITF:
1623                 case estDISRE_RM3TAV:
1624                 case estORIRE_INITF:
1625                 case estORIRE_DTAV:
1626                     /* Not implemented yet */
1627                     break;
1628                 default:
1629                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1630             }
1631         }
1632     }
1633 }
1634
1635 static char dim2char(int dim)
1636 {
1637     char c = '?';
1638
1639     switch (dim)
1640     {
1641         case XX: c = 'X'; break;
1642         case YY: c = 'Y'; break;
1643         case ZZ: c = 'Z'; break;
1644         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1645     }
1646
1647     return c;
1648 }
1649
1650 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1651                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1652 {
1653     rvec   grid_s[2], *grid_r = NULL, cx, r;
1654     char   fname[STRLEN], buf[22];
1655     FILE  *out;
1656     int    a, i, d, z, y, x;
1657     matrix tric;
1658     real   vol;
1659
1660     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1661     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1662
1663     if (DDMASTER(dd))
1664     {
1665         snew(grid_r, 2*dd->nnodes);
1666     }
1667
1668     dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1669
1670     if (DDMASTER(dd))
1671     {
1672         for (d = 0; d < DIM; d++)
1673         {
1674             for (i = 0; i < DIM; i++)
1675             {
1676                 if (d == i)
1677                 {
1678                     tric[d][i] = 1;
1679                 }
1680                 else
1681                 {
1682                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1683                     {
1684                         tric[d][i] = box[i][d]/box[i][i];
1685                     }
1686                     else
1687                     {
1688                         tric[d][i] = 0;
1689                     }
1690                 }
1691             }
1692         }
1693         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1694         out = gmx_fio_fopen(fname, "w");
1695         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1696         a = 1;
1697         for (i = 0; i < dd->nnodes; i++)
1698         {
1699             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1700             for (d = 0; d < DIM; d++)
1701             {
1702                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1703             }
1704             for (z = 0; z < 2; z++)
1705             {
1706                 for (y = 0; y < 2; y++)
1707                 {
1708                     for (x = 0; x < 2; x++)
1709                     {
1710                         cx[XX] = grid_r[i*2+x][XX];
1711                         cx[YY] = grid_r[i*2+y][YY];
1712                         cx[ZZ] = grid_r[i*2+z][ZZ];
1713                         mvmul(tric, cx, r);
1714                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1715                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1716                     }
1717                 }
1718             }
1719             for (d = 0; d < DIM; d++)
1720             {
1721                 for (x = 0; x < 4; x++)
1722                 {
1723                     switch (d)
1724                     {
1725                         case 0: y = 1 + i*8 + 2*x; break;
1726                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1727                         case 2: y = 1 + i*8 + x; break;
1728                     }
1729                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1730                 }
1731             }
1732         }
1733         gmx_fio_fclose(out);
1734         sfree(grid_r);
1735     }
1736 }
1737
1738 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1739                   gmx_mtop_t *mtop, t_commrec *cr,
1740                   int natoms, rvec x[], matrix box)
1741 {
1742     char          fname[STRLEN], buf[22];
1743     FILE         *out;
1744     int           i, ii, resnr, c;
1745     char         *atomname, *resname;
1746     real          b;
1747     gmx_domdec_t *dd;
1748
1749     dd = cr->dd;
1750     if (natoms == -1)
1751     {
1752         natoms = dd->comm->nat[ddnatVSITE];
1753     }
1754
1755     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1756
1757     out = gmx_fio_fopen(fname, "w");
1758
1759     fprintf(out, "TITLE     %s\n", title);
1760     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1761     for (i = 0; i < natoms; i++)
1762     {
1763         ii = dd->gatindex[i];
1764         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
1765         if (i < dd->comm->nat[ddnatZONE])
1766         {
1767             c = 0;
1768             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1769             {
1770                 c++;
1771             }
1772             b = c;
1773         }
1774         else if (i < dd->comm->nat[ddnatVSITE])
1775         {
1776             b = dd->comm->zones.n;
1777         }
1778         else
1779         {
1780             b = dd->comm->zones.n + 1;
1781         }
1782         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1783                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1784     }
1785     fprintf(out, "TER\n");
1786
1787     gmx_fio_fclose(out);
1788 }
1789
1790 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1791 {
1792     gmx_domdec_comm_t *comm;
1793     int                di;
1794     real               r;
1795
1796     comm = dd->comm;
1797
1798     r = -1;
1799     if (comm->bInterCGBondeds)
1800     {
1801         if (comm->cutoff_mbody > 0)
1802         {
1803             r = comm->cutoff_mbody;
1804         }
1805         else
1806         {
1807             /* cutoff_mbody=0 means we do not have DLB */
1808             r = comm->cellsize_min[dd->dim[0]];
1809             for (di = 1; di < dd->ndim; di++)
1810             {
1811                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1812             }
1813             if (comm->bBondComm)
1814             {
1815                 r = std::max(r, comm->cutoff_mbody);
1816             }
1817             else
1818             {
1819                 r = std::min(r, comm->cutoff);
1820             }
1821         }
1822     }
1823
1824     return r;
1825 }
1826
1827 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1828 {
1829     real r_mb;
1830
1831     r_mb = dd_cutoff_multibody(dd);
1832
1833     return std::max(dd->comm->cutoff, r_mb);
1834 }
1835
1836
1837 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
1838 {
1839     int nc, ntot;
1840
1841     nc   = dd->nc[dd->comm->cartpmedim];
1842     ntot = dd->comm->ntot[dd->comm->cartpmedim];
1843     copy_ivec(coord, coord_pme);
1844     coord_pme[dd->comm->cartpmedim] =
1845         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
1846 }
1847
1848 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
1849 {
1850     /* Here we assign a PME node to communicate with this DD node
1851      * by assuming that the major index of both is x.
1852      * We add cr->npmenodes/2 to obtain an even distribution.
1853      */
1854     return (ddindex*npme + npme/2)/ndd;
1855 }
1856
1857 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1858 {
1859     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
1860 }
1861
1862 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
1863 {
1864     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
1865 }
1866
1867 static int *dd_pmenodes(t_commrec *cr)
1868 {
1869     int *pmenodes;
1870     int  n, i, p0, p1;
1871
1872     snew(pmenodes, cr->npmenodes);
1873     n = 0;
1874     for (i = 0; i < cr->dd->nnodes; i++)
1875     {
1876         p0 = cr_ddindex2pmeindex(cr, i);
1877         p1 = cr_ddindex2pmeindex(cr, i+1);
1878         if (i+1 == cr->dd->nnodes || p1 > p0)
1879         {
1880             if (debug)
1881             {
1882                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
1883             }
1884             pmenodes[n] = i + 1 + n;
1885             n++;
1886         }
1887     }
1888
1889     return pmenodes;
1890 }
1891
1892 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1893 {
1894     gmx_domdec_t *dd;
1895     ivec          coords;
1896     int           slab;
1897
1898     dd = cr->dd;
1899     /*
1900        if (dd->comm->bCartesian) {
1901        gmx_ddindex2xyz(dd->nc,ddindex,coords);
1902        dd_coords2pmecoords(dd,coords,coords_pme);
1903        copy_ivec(dd->ntot,nc);
1904        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
1905        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1906
1907        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1908        } else {
1909        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
1910        }
1911      */
1912     coords[XX] = x;
1913     coords[YY] = y;
1914     coords[ZZ] = z;
1915     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1916
1917     return slab;
1918 }
1919
1920 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1921 {
1922     gmx_domdec_comm_t *comm;
1923     ivec               coords;
1924     int                ddindex, nodeid = -1;
1925
1926     comm = cr->dd->comm;
1927
1928     coords[XX] = x;
1929     coords[YY] = y;
1930     coords[ZZ] = z;
1931     if (comm->bCartesianPP_PME)
1932     {
1933 #ifdef GMX_MPI
1934         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1935 #endif
1936     }
1937     else
1938     {
1939         ddindex = dd_index(cr->dd->nc, coords);
1940         if (comm->bCartesianPP)
1941         {
1942             nodeid = comm->ddindex2simnodeid[ddindex];
1943         }
1944         else
1945         {
1946             if (comm->pmenodes)
1947             {
1948                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1949             }
1950             else
1951             {
1952                 nodeid = ddindex;
1953             }
1954         }
1955     }
1956
1957     return nodeid;
1958 }
1959
1960 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
1961 {
1962     gmx_domdec_t      *dd;
1963     gmx_domdec_comm_t *comm;
1964     int                i;
1965     int                pmenode = -1;
1966
1967     dd   = cr->dd;
1968     comm = dd->comm;
1969
1970     /* This assumes a uniform x domain decomposition grid cell size */
1971     if (comm->bCartesianPP_PME)
1972     {
1973 #ifdef GMX_MPI
1974         ivec coord, coord_pme;
1975         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1976         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1977         {
1978             /* This is a PP node */
1979             dd_cart_coord2pmecoord(dd, coord, coord_pme);
1980             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1981         }
1982 #endif
1983     }
1984     else if (comm->bCartesianPP)
1985     {
1986         if (sim_nodeid < dd->nnodes)
1987         {
1988             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1989         }
1990     }
1991     else
1992     {
1993         /* This assumes DD cells with identical x coordinates
1994          * are numbered sequentially.
1995          */
1996         if (dd->comm->pmenodes == NULL)
1997         {
1998             if (sim_nodeid < dd->nnodes)
1999             {
2000                 /* The DD index equals the nodeid */
2001                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2002             }
2003         }
2004         else
2005         {
2006             i = 0;
2007             while (sim_nodeid > dd->comm->pmenodes[i])
2008             {
2009                 i++;
2010             }
2011             if (sim_nodeid < dd->comm->pmenodes[i])
2012             {
2013                 pmenode = dd->comm->pmenodes[i];
2014             }
2015         }
2016     }
2017
2018     return pmenode;
2019 }
2020
2021 void get_pme_nnodes(const gmx_domdec_t *dd,
2022                     int *npmenodes_x, int *npmenodes_y)
2023 {
2024     if (dd != NULL)
2025     {
2026         *npmenodes_x = dd->comm->npmenodes_x;
2027         *npmenodes_y = dd->comm->npmenodes_y;
2028     }
2029     else
2030     {
2031         *npmenodes_x = 1;
2032         *npmenodes_y = 1;
2033     }
2034 }
2035
2036 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2037                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2038 {
2039     gmx_domdec_t *dd;
2040     int           x, y, z;
2041     ivec          coord, coord_pme;
2042
2043     dd = cr->dd;
2044
2045     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2046
2047     *nmy_ddnodes = 0;
2048     for (x = 0; x < dd->nc[XX]; x++)
2049     {
2050         for (y = 0; y < dd->nc[YY]; y++)
2051         {
2052             for (z = 0; z < dd->nc[ZZ]; z++)
2053             {
2054                 if (dd->comm->bCartesianPP_PME)
2055                 {
2056                     coord[XX] = x;
2057                     coord[YY] = y;
2058                     coord[ZZ] = z;
2059                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2060                     if (dd->ci[XX] == coord_pme[XX] &&
2061                         dd->ci[YY] == coord_pme[YY] &&
2062                         dd->ci[ZZ] == coord_pme[ZZ])
2063                     {
2064                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2065                     }
2066                 }
2067                 else
2068                 {
2069                     /* The slab corresponds to the nodeid in the PME group */
2070                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2071                     {
2072                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2073                     }
2074                 }
2075             }
2076         }
2077     }
2078
2079     /* The last PP-only node is the peer node */
2080     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2081
2082     if (debug)
2083     {
2084         fprintf(debug, "Receive coordinates from PP ranks:");
2085         for (x = 0; x < *nmy_ddnodes; x++)
2086         {
2087             fprintf(debug, " %d", (*my_ddnodes)[x]);
2088         }
2089         fprintf(debug, "\n");
2090     }
2091 }
2092
2093 static gmx_bool receive_vir_ener(t_commrec *cr)
2094 {
2095     gmx_domdec_comm_t *comm;
2096     int                pmenode;
2097     gmx_bool           bReceive;
2098
2099     bReceive = TRUE;
2100     if (cr->npmenodes < cr->dd->nnodes)
2101     {
2102         comm = cr->dd->comm;
2103         if (comm->bCartesianPP_PME)
2104         {
2105             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2106 #ifdef GMX_MPI
2107             ivec coords;
2108             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2109             coords[comm->cartpmedim]++;
2110             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2111             {
2112                 int rank;
2113                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2114                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2115                 {
2116                     /* This is not the last PP node for pmenode */
2117                     bReceive = FALSE;
2118                 }
2119             }
2120 #endif
2121         }
2122         else
2123         {
2124             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2125             if (cr->sim_nodeid+1 < cr->nnodes &&
2126                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2127             {
2128                 /* This is not the last PP node for pmenode */
2129                 bReceive = FALSE;
2130             }
2131         }
2132     }
2133
2134     return bReceive;
2135 }
2136
2137 static void set_zones_ncg_home(gmx_domdec_t *dd)
2138 {
2139     gmx_domdec_zones_t *zones;
2140     int                 i;
2141
2142     zones = &dd->comm->zones;
2143
2144     zones->cg_range[0] = 0;
2145     for (i = 1; i < zones->n+1; i++)
2146     {
2147         zones->cg_range[i] = dd->ncg_home;
2148     }
2149     /* zone_ncg1[0] should always be equal to ncg_home */
2150     dd->comm->zone_ncg1[0] = dd->ncg_home;
2151 }
2152
2153 static void rebuild_cgindex(gmx_domdec_t *dd,
2154                             const int *gcgs_index, t_state *state)
2155 {
2156     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2157
2158     ind        = state->cg_gl;
2159     dd_cg_gl   = dd->index_gl;
2160     cgindex    = dd->cgindex;
2161     nat        = 0;
2162     cgindex[0] = nat;
2163     for (i = 0; i < state->ncg_gl; i++)
2164     {
2165         cgindex[i]  = nat;
2166         cg_gl       = ind[i];
2167         dd_cg_gl[i] = cg_gl;
2168         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2169     }
2170     cgindex[i] = nat;
2171
2172     dd->ncg_home = state->ncg_gl;
2173     dd->nat_home = nat;
2174
2175     set_zones_ncg_home(dd);
2176 }
2177
2178 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2179 {
2180     while (cg >= cginfo_mb->cg_end)
2181     {
2182         cginfo_mb++;
2183     }
2184
2185     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2186 }
2187
2188 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2189                           t_forcerec *fr, char *bLocalCG)
2190 {
2191     cginfo_mb_t *cginfo_mb;
2192     int         *cginfo;
2193     int          cg;
2194
2195     if (fr != NULL)
2196     {
2197         cginfo_mb = fr->cginfo_mb;
2198         cginfo    = fr->cginfo;
2199
2200         for (cg = cg0; cg < cg1; cg++)
2201         {
2202             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2203         }
2204     }
2205
2206     if (bLocalCG != NULL)
2207     {
2208         for (cg = cg0; cg < cg1; cg++)
2209         {
2210             bLocalCG[index_gl[cg]] = TRUE;
2211         }
2212     }
2213 }
2214
2215 static void make_dd_indices(gmx_domdec_t *dd,
2216                             const int *gcgs_index, int cg_start)
2217 {
2218     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2219     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2220     gmx_bool     bCGs;
2221
2222     if (dd->nat_tot > dd->gatindex_nalloc)
2223     {
2224         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2225         srenew(dd->gatindex, dd->gatindex_nalloc);
2226     }
2227
2228     nzone      = dd->comm->zones.n;
2229     zone2cg    = dd->comm->zones.cg_range;
2230     zone_ncg1  = dd->comm->zone_ncg1;
2231     index_gl   = dd->index_gl;
2232     gatindex   = dd->gatindex;
2233     bCGs       = dd->comm->bCGs;
2234
2235     if (zone2cg[1] != dd->ncg_home)
2236     {
2237         gmx_incons("dd->ncg_zone is not up to date");
2238     }
2239
2240     /* Make the local to global and global to local atom index */
2241     a = dd->cgindex[cg_start];
2242     for (zone = 0; zone < nzone; zone++)
2243     {
2244         if (zone == 0)
2245         {
2246             cg0 = cg_start;
2247         }
2248         else
2249         {
2250             cg0 = zone2cg[zone];
2251         }
2252         cg1    = zone2cg[zone+1];
2253         cg1_p1 = cg0 + zone_ncg1[zone];
2254
2255         for (cg = cg0; cg < cg1; cg++)
2256         {
2257             zone1 = zone;
2258             if (cg >= cg1_p1)
2259             {
2260                 /* Signal that this cg is from more than one pulse away */
2261                 zone1 += nzone;
2262             }
2263             cg_gl = index_gl[cg];
2264             if (bCGs)
2265             {
2266                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2267                 {
2268                     gatindex[a] = a_gl;
2269                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2270                     a++;
2271                 }
2272             }
2273             else
2274             {
2275                 gatindex[a] = cg_gl;
2276                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2277                 a++;
2278             }
2279         }
2280     }
2281 }
2282
2283 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2284                           const char *where)
2285 {
2286     int i, ngl, nerr;
2287
2288     nerr = 0;
2289     if (bLocalCG == NULL)
2290     {
2291         return nerr;
2292     }
2293     for (i = 0; i < dd->ncg_tot; i++)
2294     {
2295         if (!bLocalCG[dd->index_gl[i]])
2296         {
2297             fprintf(stderr,
2298                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2299             nerr++;
2300         }
2301     }
2302     ngl = 0;
2303     for (i = 0; i < ncg_sys; i++)
2304     {
2305         if (bLocalCG[i])
2306         {
2307             ngl++;
2308         }
2309     }
2310     if (ngl != dd->ncg_tot)
2311     {
2312         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2313         nerr++;
2314     }
2315
2316     return nerr;
2317 }
2318
2319 static void check_index_consistency(gmx_domdec_t *dd,
2320                                     int natoms_sys, int ncg_sys,
2321                                     const char *where)
2322 {
2323     int   nerr, ngl, i, a, cell;
2324     int  *have;
2325
2326     nerr = 0;
2327
2328     if (dd->comm->DD_debug > 1)
2329     {
2330         snew(have, natoms_sys);
2331         for (a = 0; a < dd->nat_tot; a++)
2332         {
2333             if (have[dd->gatindex[a]] > 0)
2334             {
2335                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2336             }
2337             else
2338             {
2339                 have[dd->gatindex[a]] = a + 1;
2340             }
2341         }
2342         sfree(have);
2343     }
2344
2345     snew(have, dd->nat_tot);
2346
2347     ngl  = 0;
2348     for (i = 0; i < natoms_sys; i++)
2349     {
2350         if (ga2la_get(dd->ga2la, i, &a, &cell))
2351         {
2352             if (a >= dd->nat_tot)
2353             {
2354                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2355                 nerr++;
2356             }
2357             else
2358             {
2359                 have[a] = 1;
2360                 if (dd->gatindex[a] != i)
2361                 {
2362                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2363                     nerr++;
2364                 }
2365             }
2366             ngl++;
2367         }
2368     }
2369     if (ngl != dd->nat_tot)
2370     {
2371         fprintf(stderr,
2372                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2373                 dd->rank, where, ngl, dd->nat_tot);
2374     }
2375     for (a = 0; a < dd->nat_tot; a++)
2376     {
2377         if (have[a] == 0)
2378         {
2379             fprintf(stderr,
2380                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2381                     dd->rank, where, a+1, dd->gatindex[a]+1);
2382         }
2383     }
2384     sfree(have);
2385
2386     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2387
2388     if (nerr > 0)
2389     {
2390         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2391                   dd->rank, where, nerr);
2392     }
2393 }
2394
2395 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2396 {
2397     int   i;
2398     char *bLocalCG;
2399
2400     if (a_start == 0)
2401     {
2402         /* Clear the whole list without searching */
2403         ga2la_clear(dd->ga2la);
2404     }
2405     else
2406     {
2407         for (i = a_start; i < dd->nat_tot; i++)
2408         {
2409             ga2la_del(dd->ga2la, dd->gatindex[i]);
2410         }
2411     }
2412
2413     bLocalCG = dd->comm->bLocalCG;
2414     if (bLocalCG)
2415     {
2416         for (i = cg_start; i < dd->ncg_tot; i++)
2417         {
2418             bLocalCG[dd->index_gl[i]] = FALSE;
2419         }
2420     }
2421
2422     dd_clear_local_vsite_indices(dd);
2423
2424     if (dd->constraints)
2425     {
2426         dd_clear_local_constraint_indices(dd);
2427     }
2428 }
2429
2430 /* This function should be used for moving the domain boudaries during DLB,
2431  * for obtaining the minimum cell size. It checks the initially set limit
2432  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2433  * and, possibly, a longer cut-off limit set for PME load balancing.
2434  */
2435 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2436 {
2437     real cellsize_min;
2438
2439     cellsize_min = comm->cellsize_min[dim];
2440
2441     if (!comm->bVacDLBNoLimit)
2442     {
2443         /* The cut-off might have changed, e.g. by PME load balacning,
2444          * from the value used to set comm->cellsize_min, so check it.
2445          */
2446         cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2447
2448         if (comm->bPMELoadBalDLBLimits)
2449         {
2450             /* Check for the cut-off limit set by the PME load balancing */
2451             cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2452         }
2453     }
2454
2455     return cellsize_min;
2456 }
2457
2458 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2459                             int dim_ind)
2460 {
2461     real grid_jump_limit;
2462
2463     /* The distance between the boundaries of cells at distance
2464      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2465      * and by the fact that cells should not be shifted by more than
2466      * half their size, such that cg's only shift by one cell
2467      * at redecomposition.
2468      */
2469     grid_jump_limit = comm->cellsize_limit;
2470     if (!comm->bVacDLBNoLimit)
2471     {
2472         if (comm->bPMELoadBalDLBLimits)
2473         {
2474             cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2475         }
2476         grid_jump_limit = std::max(grid_jump_limit,
2477                                    cutoff/comm->cd[dim_ind].np);
2478     }
2479
2480     return grid_jump_limit;
2481 }
2482
2483 static gmx_bool check_grid_jump(gmx_int64_t     step,
2484                                 gmx_domdec_t   *dd,
2485                                 real            cutoff,
2486                                 gmx_ddbox_t    *ddbox,
2487                                 gmx_bool        bFatal)
2488 {
2489     gmx_domdec_comm_t *comm;
2490     int                d, dim;
2491     real               limit, bfac;
2492     gmx_bool           bInvalid;
2493
2494     bInvalid = FALSE;
2495
2496     comm = dd->comm;
2497
2498     for (d = 1; d < dd->ndim; d++)
2499     {
2500         dim   = dd->dim[d];
2501         limit = grid_jump_limit(comm, cutoff, d);
2502         bfac  = ddbox->box_size[dim];
2503         if (ddbox->tric_dir[dim])
2504         {
2505             bfac *= ddbox->skew_fac[dim];
2506         }
2507         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2508                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2509         {
2510             bInvalid = TRUE;
2511
2512             if (bFatal)
2513             {
2514                 char buf[22];
2515
2516                 /* This error should never be triggered under normal
2517                  * circumstances, but you never know ...
2518                  */
2519                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2520                           gmx_step_str(step, buf),
2521                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2522             }
2523         }
2524     }
2525
2526     return bInvalid;
2527 }
2528
2529 static int dd_load_count(gmx_domdec_comm_t *comm)
2530 {
2531     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2532 }
2533
2534 static float dd_force_load(gmx_domdec_comm_t *comm)
2535 {
2536     float load;
2537
2538     if (comm->eFlop)
2539     {
2540         load = comm->flop;
2541         if (comm->eFlop > 1)
2542         {
2543             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2544         }
2545     }
2546     else
2547     {
2548         load = comm->cycl[ddCyclF];
2549         if (comm->cycl_n[ddCyclF] > 1)
2550         {
2551             /* Subtract the maximum of the last n cycle counts
2552              * to get rid of possible high counts due to other sources,
2553              * for instance system activity, that would otherwise
2554              * affect the dynamic load balancing.
2555              */
2556             load -= comm->cycl_max[ddCyclF];
2557         }
2558
2559 #ifdef GMX_MPI
2560         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2561         {
2562             float gpu_wait, gpu_wait_sum;
2563
2564             gpu_wait = comm->cycl[ddCyclWaitGPU];
2565             if (comm->cycl_n[ddCyclF] > 1)
2566             {
2567                 /* We should remove the WaitGPU time of the same MD step
2568                  * as the one with the maximum F time, since the F time
2569                  * and the wait time are not independent.
2570                  * Furthermore, the step for the max F time should be chosen
2571                  * the same on all ranks that share the same GPU.
2572                  * But to keep the code simple, we remove the average instead.
2573                  * The main reason for artificially long times at some steps
2574                  * is spurious CPU activity or MPI time, so we don't expect
2575                  * that changes in the GPU wait time matter a lot here.
2576                  */
2577                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2578             }
2579             /* Sum the wait times over the ranks that share the same GPU */
2580             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2581                           comm->mpi_comm_gpu_shared);
2582             /* Replace the wait time by the average over the ranks */
2583             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2584         }
2585 #endif
2586     }
2587
2588     return load;
2589 }
2590
2591 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2592 {
2593     gmx_domdec_comm_t *comm;
2594     int                i;
2595
2596     comm = dd->comm;
2597
2598     snew(*dim_f, dd->nc[dim]+1);
2599     (*dim_f)[0] = 0;
2600     for (i = 1; i < dd->nc[dim]; i++)
2601     {
2602         if (comm->slb_frac[dim])
2603         {
2604             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2605         }
2606         else
2607         {
2608             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2609         }
2610     }
2611     (*dim_f)[dd->nc[dim]] = 1;
2612 }
2613
2614 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2615 {
2616     int  pmeindex, slab, nso, i;
2617     ivec xyz;
2618
2619     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2620     {
2621         ddpme->dim = YY;
2622     }
2623     else
2624     {
2625         ddpme->dim = dimind;
2626     }
2627     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2628
2629     ddpme->nslab = (ddpme->dim == 0 ?
2630                     dd->comm->npmenodes_x :
2631                     dd->comm->npmenodes_y);
2632
2633     if (ddpme->nslab <= 1)
2634     {
2635         return;
2636     }
2637
2638     nso = dd->comm->npmenodes/ddpme->nslab;
2639     /* Determine for each PME slab the PP location range for dimension dim */
2640     snew(ddpme->pp_min, ddpme->nslab);
2641     snew(ddpme->pp_max, ddpme->nslab);
2642     for (slab = 0; slab < ddpme->nslab; slab++)
2643     {
2644         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2645         ddpme->pp_max[slab] = 0;
2646     }
2647     for (i = 0; i < dd->nnodes; i++)
2648     {
2649         ddindex2xyz(dd->nc, i, xyz);
2650         /* For y only use our y/z slab.
2651          * This assumes that the PME x grid size matches the DD grid size.
2652          */
2653         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2654         {
2655             pmeindex = ddindex2pmeindex(dd, i);
2656             if (dimind == 0)
2657             {
2658                 slab = pmeindex/nso;
2659             }
2660             else
2661             {
2662                 slab = pmeindex % ddpme->nslab;
2663             }
2664             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2665             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2666         }
2667     }
2668
2669     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2670 }
2671
2672 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2673 {
2674     if (dd->comm->ddpme[0].dim == XX)
2675     {
2676         return dd->comm->ddpme[0].maxshift;
2677     }
2678     else
2679     {
2680         return 0;
2681     }
2682 }
2683
2684 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2685 {
2686     if (dd->comm->ddpme[0].dim == YY)
2687     {
2688         return dd->comm->ddpme[0].maxshift;
2689     }
2690     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2691     {
2692         return dd->comm->ddpme[1].maxshift;
2693     }
2694     else
2695     {
2696         return 0;
2697     }
2698 }
2699
2700 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2701                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2702 {
2703     gmx_domdec_comm_t *comm;
2704     int                nc, ns, s;
2705     int               *xmin, *xmax;
2706     real               range, pme_boundary;
2707     int                sh;
2708
2709     comm = dd->comm;
2710     nc   = dd->nc[ddpme->dim];
2711     ns   = ddpme->nslab;
2712
2713     if (!ddpme->dim_match)
2714     {
2715         /* PP decomposition is not along dim: the worst situation */
2716         sh = ns/2;
2717     }
2718     else if (ns <= 3 || (bUniform && ns == nc))
2719     {
2720         /* The optimal situation */
2721         sh = 1;
2722     }
2723     else
2724     {
2725         /* We need to check for all pme nodes which nodes they
2726          * could possibly need to communicate with.
2727          */
2728         xmin = ddpme->pp_min;
2729         xmax = ddpme->pp_max;
2730         /* Allow for atoms to be maximally 2/3 times the cut-off
2731          * out of their DD cell. This is a reasonable balance between
2732          * between performance and support for most charge-group/cut-off
2733          * combinations.
2734          */
2735         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2736         /* Avoid extra communication when we are exactly at a boundary */
2737         range *= 0.999;
2738
2739         sh = 1;
2740         for (s = 0; s < ns; s++)
2741         {
2742             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2743             pme_boundary = (real)s/ns;
2744             while (sh+1 < ns &&
2745                    ((s-(sh+1) >= 0 &&
2746                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2747                     (s-(sh+1) <  0 &&
2748                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2749             {
2750                 sh++;
2751             }
2752             pme_boundary = (real)(s+1)/ns;
2753             while (sh+1 < ns &&
2754                    ((s+(sh+1) <  ns &&
2755                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2756                     (s+(sh+1) >= ns &&
2757                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2758             {
2759                 sh++;
2760             }
2761         }
2762     }
2763
2764     ddpme->maxshift = sh;
2765
2766     if (debug)
2767     {
2768         fprintf(debug, "PME slab communication range for dim %d is %d\n",
2769                 ddpme->dim, ddpme->maxshift);
2770     }
2771 }
2772
2773 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2774 {
2775     int d, dim;
2776
2777     for (d = 0; d < dd->ndim; d++)
2778     {
2779         dim = dd->dim[d];
2780         if (dim < ddbox->nboundeddim &&
2781             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2782             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2783         {
2784             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2785                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2786                       dd->nc[dim], dd->comm->cellsize_limit);
2787         }
2788     }
2789 }
2790
2791 enum {
2792     setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2793 };
2794
2795 /* Set the domain boundaries. Use for static (or no) load balancing,
2796  * and also for the starting state for dynamic load balancing.
2797  * setmode determine if and where the boundaries are stored, use enum above.
2798  * Returns the number communication pulses in npulse.
2799  */
2800 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
2801                                   int setmode, ivec npulse)
2802 {
2803     gmx_domdec_comm_t *comm;
2804     int                d, j;
2805     rvec               cellsize_min;
2806     real              *cell_x, cell_dx, cellsize;
2807
2808     comm = dd->comm;
2809
2810     for (d = 0; d < DIM; d++)
2811     {
2812         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2813         npulse[d]       = 1;
2814         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2815         {
2816             /* Uniform grid */
2817             cell_dx = ddbox->box_size[d]/dd->nc[d];
2818             switch (setmode)
2819             {
2820                 case setcellsizeslbMASTER:
2821                     for (j = 0; j < dd->nc[d]+1; j++)
2822                     {
2823                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2824                     }
2825                     break;
2826                 case setcellsizeslbLOCAL:
2827                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2828                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2829                     break;
2830                 default:
2831                     break;
2832             }
2833             cellsize = cell_dx*ddbox->skew_fac[d];
2834             while (cellsize*npulse[d] < comm->cutoff)
2835             {
2836                 npulse[d]++;
2837             }
2838             cellsize_min[d] = cellsize;
2839         }
2840         else
2841         {
2842             /* Statically load balanced grid */
2843             /* Also when we are not doing a master distribution we determine
2844              * all cell borders in a loop to obtain identical values
2845              * to the master distribution case and to determine npulse.
2846              */
2847             if (setmode == setcellsizeslbMASTER)
2848             {
2849                 cell_x = dd->ma->cell_x[d];
2850             }
2851             else
2852             {
2853                 snew(cell_x, dd->nc[d]+1);
2854             }
2855             cell_x[0] = ddbox->box0[d];
2856             for (j = 0; j < dd->nc[d]; j++)
2857             {
2858                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
2859                 cell_x[j+1] = cell_x[j] + cell_dx;
2860                 cellsize    = cell_dx*ddbox->skew_fac[d];
2861                 while (cellsize*npulse[d] < comm->cutoff &&
2862                        npulse[d] < dd->nc[d]-1)
2863                 {
2864                     npulse[d]++;
2865                 }
2866                 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2867             }
2868             if (setmode == setcellsizeslbLOCAL)
2869             {
2870                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2871                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2872             }
2873             if (setmode != setcellsizeslbMASTER)
2874             {
2875                 sfree(cell_x);
2876             }
2877         }
2878         /* The following limitation is to avoid that a cell would receive
2879          * some of its own home charge groups back over the periodic boundary.
2880          * Double charge groups cause trouble with the global indices.
2881          */
2882         if (d < ddbox->npbcdim &&
2883             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2884         {
2885             char error_string[STRLEN];
2886
2887             sprintf(error_string,
2888                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2889                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2890                     comm->cutoff,
2891                     dd->nc[d], dd->nc[d],
2892                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2893
2894             if (setmode == setcellsizeslbLOCAL)
2895             {
2896                 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2897                                      error_string);
2898             }
2899             else
2900             {
2901                 gmx_fatal(FARGS, error_string);
2902             }
2903         }
2904     }
2905
2906     if (!dlbIsOn(comm))
2907     {
2908         copy_rvec(cellsize_min, comm->cellsize_min);
2909     }
2910
2911     for (d = 0; d < comm->npmedecompdim; d++)
2912     {
2913         set_pme_maxshift(dd, &comm->ddpme[d],
2914                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
2915                          comm->ddpme[d].slb_dim_f);
2916     }
2917 }
2918
2919
2920 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2921                                                   int d, int dim, domdec_root_t *root,
2922                                                   gmx_ddbox_t *ddbox,
2923                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2924 {
2925     gmx_domdec_comm_t *comm;
2926     int                ncd, i, j, nmin, nmin_old;
2927     gmx_bool           bLimLo, bLimHi;
2928     real              *cell_size;
2929     real               fac, halfway, cellsize_limit_f_i, region_size;
2930     gmx_bool           bPBC, bLastHi = FALSE;
2931     int                nrange[] = {range[0], range[1]};
2932
2933     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2934
2935     comm = dd->comm;
2936
2937     ncd = dd->nc[dim];
2938
2939     bPBC = (dim < ddbox->npbcdim);
2940
2941     cell_size = root->buf_ncd;
2942
2943     if (debug)
2944     {
2945         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2946     }
2947
2948     /* First we need to check if the scaling does not make cells
2949      * smaller than the smallest allowed size.
2950      * We need to do this iteratively, since if a cell is too small,
2951      * it needs to be enlarged, which makes all the other cells smaller,
2952      * which could in turn make another cell smaller than allowed.
2953      */
2954     for (i = range[0]; i < range[1]; i++)
2955     {
2956         root->bCellMin[i] = FALSE;
2957     }
2958     nmin = 0;
2959     do
2960     {
2961         nmin_old = nmin;
2962         /* We need the total for normalization */
2963         fac = 0;
2964         for (i = range[0]; i < range[1]; i++)
2965         {
2966             if (root->bCellMin[i] == FALSE)
2967             {
2968                 fac += cell_size[i];
2969             }
2970         }
2971         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
2972         /* Determine the cell boundaries */
2973         for (i = range[0]; i < range[1]; i++)
2974         {
2975             if (root->bCellMin[i] == FALSE)
2976             {
2977                 cell_size[i] *= fac;
2978                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2979                 {
2980                     cellsize_limit_f_i = 0;
2981                 }
2982                 else
2983                 {
2984                     cellsize_limit_f_i = cellsize_limit_f;
2985                 }
2986                 if (cell_size[i] < cellsize_limit_f_i)
2987                 {
2988                     root->bCellMin[i] = TRUE;
2989                     cell_size[i]      = cellsize_limit_f_i;
2990                     nmin++;
2991                 }
2992             }
2993             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2994         }
2995     }
2996     while (nmin > nmin_old);
2997
2998     i            = range[1]-1;
2999     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3000     /* For this check we should not use DD_CELL_MARGIN,
3001      * but a slightly smaller factor,
3002      * since rounding could get use below the limit.
3003      */
3004     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3005     {
3006         char buf[22];
3007         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3008                   gmx_step_str(step, buf),
3009                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3010                   ncd, comm->cellsize_min[dim]);
3011     }
3012
3013     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3014
3015     if (!bUniform)
3016     {
3017         /* Check if the boundary did not displace more than halfway
3018          * each of the cells it bounds, as this could cause problems,
3019          * especially when the differences between cell sizes are large.
3020          * If changes are applied, they will not make cells smaller
3021          * than the cut-off, as we check all the boundaries which
3022          * might be affected by a change and if the old state was ok,
3023          * the cells will at most be shrunk back to their old size.
3024          */
3025         for (i = range[0]+1; i < range[1]; i++)
3026         {
3027             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3028             if (root->cell_f[i] < halfway)
3029             {
3030                 root->cell_f[i] = halfway;
3031                 /* Check if the change also causes shifts of the next boundaries */
3032                 for (j = i+1; j < range[1]; j++)
3033                 {
3034                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3035                     {
3036                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3037                     }
3038                 }
3039             }
3040             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3041             if (root->cell_f[i] > halfway)
3042             {
3043                 root->cell_f[i] = halfway;
3044                 /* Check if the change also causes shifts of the next boundaries */
3045                 for (j = i-1; j >= range[0]+1; j--)
3046                 {
3047                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3048                     {
3049                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3050                     }
3051                 }
3052             }
3053         }
3054     }
3055
3056     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3057     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3058      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3059      * for a and b nrange is used */
3060     if (d > 0)
3061     {
3062         /* Take care of the staggering of the cell boundaries */
3063         if (bUniform)
3064         {
3065             for (i = range[0]; i < range[1]; i++)
3066             {
3067                 root->cell_f_max0[i] = root->cell_f[i];
3068                 root->cell_f_min1[i] = root->cell_f[i+1];
3069             }
3070         }
3071         else
3072         {
3073             for (i = range[0]+1; i < range[1]; i++)
3074             {
3075                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3076                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3077                 if (bLimLo && bLimHi)
3078                 {
3079                     /* Both limits violated, try the best we can */
3080                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3081                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3082                     nrange[0]       = range[0];
3083                     nrange[1]       = i;
3084                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3085
3086                     nrange[0] = i;
3087                     nrange[1] = range[1];
3088                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3089
3090                     return;
3091                 }
3092                 else if (bLimLo)
3093                 {
3094                     /* root->cell_f[i] = root->bound_min[i]; */
3095                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3096                     bLastHi   = FALSE;
3097                 }
3098                 else if (bLimHi && !bLastHi)
3099                 {
3100                     bLastHi = TRUE;
3101                     if (nrange[1] < range[1])   /* found a LimLo before */
3102                     {
3103                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3104                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3105                         nrange[0] = nrange[1];
3106                     }
3107                     root->cell_f[i] = root->bound_max[i];
3108                     nrange[1]       = i;
3109                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3110                     nrange[0] = i;
3111                     nrange[1] = range[1];
3112                 }
3113             }
3114             if (nrange[1] < range[1])   /* found last a LimLo */
3115             {
3116                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3117                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3118                 nrange[0] = nrange[1];
3119                 nrange[1] = range[1];
3120                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3121             }
3122             else if (nrange[0] > range[0]) /* found at least one LimHi */
3123             {
3124                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3125             }
3126         }
3127     }
3128 }
3129
3130
3131 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3132                                        int d, int dim, domdec_root_t *root,
3133                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3134                                        gmx_bool bUniform, gmx_int64_t step)
3135 {
3136     gmx_domdec_comm_t *comm;
3137     int                ncd, d1, i, pos;
3138     real              *cell_size;
3139     real               load_aver, load_i, imbalance, change, change_max, sc;
3140     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3141     real               change_limit;
3142     real               relax = 0.5;
3143     gmx_bool           bPBC;
3144     int                range[] = { 0, 0 };
3145
3146     comm = dd->comm;
3147
3148     /* Convert the maximum change from the input percentage to a fraction */
3149     change_limit = comm->dlb_scale_lim*0.01;
3150
3151     ncd = dd->nc[dim];
3152
3153     bPBC = (dim < ddbox->npbcdim);
3154
3155     cell_size = root->buf_ncd;
3156
3157     /* Store the original boundaries */
3158     for (i = 0; i < ncd+1; i++)
3159     {
3160         root->old_cell_f[i] = root->cell_f[i];
3161     }
3162     if (bUniform)
3163     {
3164         for (i = 0; i < ncd; i++)
3165         {
3166             cell_size[i] = 1.0/ncd;
3167         }
3168     }
3169     else if (dd_load_count(comm) > 0)
3170     {
3171         load_aver  = comm->load[d].sum_m/ncd;
3172         change_max = 0;
3173         for (i = 0; i < ncd; i++)
3174         {
3175             /* Determine the relative imbalance of cell i */
3176             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3177             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3178             /* Determine the change of the cell size using underrelaxation */
3179             change     = -relax*imbalance;
3180             change_max = std::max(change_max, std::max(change, -change));
3181         }
3182         /* Limit the amount of scaling.
3183          * We need to use the same rescaling for all cells in one row,
3184          * otherwise the load balancing might not converge.
3185          */
3186         sc = relax;
3187         if (change_max > change_limit)
3188         {
3189             sc *= change_limit/change_max;
3190         }
3191         for (i = 0; i < ncd; i++)
3192         {
3193             /* Determine the relative imbalance of cell i */
3194             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3195             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3196             /* Determine the change of the cell size using underrelaxation */
3197             change       = -sc*imbalance;
3198             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3199         }
3200     }
3201
3202     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3203     cellsize_limit_f *= DD_CELL_MARGIN;
3204     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3205     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3206     if (ddbox->tric_dir[dim])
3207     {
3208         cellsize_limit_f /= ddbox->skew_fac[dim];
3209         dist_min_f       /= ddbox->skew_fac[dim];
3210     }
3211     if (bDynamicBox && d > 0)
3212     {
3213         dist_min_f *= DD_PRES_SCALE_MARGIN;
3214     }
3215     if (d > 0 && !bUniform)
3216     {
3217         /* Make sure that the grid is not shifted too much */
3218         for (i = 1; i < ncd; i++)
3219         {
3220             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3221             {
3222                 gmx_incons("Inconsistent DD boundary staggering limits!");
3223             }
3224             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3225             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3226             if (space > 0)
3227             {
3228                 root->bound_min[i] += 0.5*space;
3229             }
3230             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3231             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3232             if (space < 0)
3233             {
3234                 root->bound_max[i] += 0.5*space;
3235             }
3236             if (debug)
3237             {
3238                 fprintf(debug,
3239                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3240                         d, i,
3241                         root->cell_f_max0[i-1] + dist_min_f,
3242                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3243                         root->cell_f_min1[i] - dist_min_f);
3244             }
3245         }
3246     }
3247     range[1]          = ncd;
3248     root->cell_f[0]   = 0;
3249     root->cell_f[ncd] = 1;
3250     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3251
3252
3253     /* After the checks above, the cells should obey the cut-off
3254      * restrictions, but it does not hurt to check.
3255      */
3256     for (i = 0; i < ncd; i++)
3257     {
3258         if (debug)
3259         {
3260             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3261                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3262         }
3263
3264         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3265             root->cell_f[i+1] - root->cell_f[i] <
3266             cellsize_limit_f/DD_CELL_MARGIN)
3267         {
3268             char buf[22];
3269             fprintf(stderr,
3270                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3271                     gmx_step_str(step, buf), dim2char(dim), i,
3272                     (root->cell_f[i+1] - root->cell_f[i])
3273                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3274         }
3275     }
3276
3277     pos = ncd + 1;
3278     /* Store the cell boundaries of the lower dimensions at the end */
3279     for (d1 = 0; d1 < d; d1++)
3280     {
3281         root->cell_f[pos++] = comm->cell_f0[d1];
3282         root->cell_f[pos++] = comm->cell_f1[d1];
3283     }
3284
3285     if (d < comm->npmedecompdim)
3286     {
3287         /* The master determines the maximum shift for
3288          * the coordinate communication between separate PME nodes.
3289          */
3290         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3291     }
3292     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3293     if (d >= 1)
3294     {
3295         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3296     }
3297 }
3298
3299 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3300                                              gmx_ddbox_t *ddbox, int dimind)
3301 {
3302     gmx_domdec_comm_t *comm;
3303     int                dim;
3304
3305     comm = dd->comm;
3306
3307     /* Set the cell dimensions */
3308     dim                = dd->dim[dimind];
3309     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3310     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3311     if (dim >= ddbox->nboundeddim)
3312     {
3313         comm->cell_x0[dim] += ddbox->box0[dim];
3314         comm->cell_x1[dim] += ddbox->box0[dim];
3315     }
3316 }
3317
3318 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3319                                          int d, int dim, real *cell_f_row,
3320                                          gmx_ddbox_t *ddbox)
3321 {
3322     gmx_domdec_comm_t *comm;
3323     int                d1, pos;
3324
3325     comm = dd->comm;
3326
3327 #ifdef GMX_MPI
3328     /* Each node would only need to know two fractions,
3329      * but it is probably cheaper to broadcast the whole array.
3330      */
3331     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3332               0, comm->mpi_comm_load[d]);
3333 #endif
3334     /* Copy the fractions for this dimension from the buffer */
3335     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3336     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3337     /* The whole array was communicated, so set the buffer position */
3338     pos = dd->nc[dim] + 1;
3339     for (d1 = 0; d1 <= d; d1++)
3340     {
3341         if (d1 < d)
3342         {
3343             /* Copy the cell fractions of the lower dimensions */
3344             comm->cell_f0[d1] = cell_f_row[pos++];
3345             comm->cell_f1[d1] = cell_f_row[pos++];
3346         }
3347         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3348     }
3349     /* Convert the communicated shift from float to int */
3350     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3351     if (d >= 1)
3352     {
3353         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3354     }
3355 }
3356
3357 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3358                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3359                                          gmx_bool bUniform, gmx_int64_t step)
3360 {
3361     gmx_domdec_comm_t *comm;
3362     int                d, dim, d1;
3363     gmx_bool           bRowMember, bRowRoot;
3364     real              *cell_f_row;
3365
3366     comm = dd->comm;
3367
3368     for (d = 0; d < dd->ndim; d++)
3369     {
3370         dim        = dd->dim[d];
3371         bRowMember = TRUE;
3372         bRowRoot   = TRUE;
3373         for (d1 = d; d1 < dd->ndim; d1++)
3374         {
3375             if (dd->ci[dd->dim[d1]] > 0)
3376             {
3377                 if (d1 != d)
3378                 {
3379                     bRowMember = FALSE;
3380                 }
3381                 bRowRoot = FALSE;
3382             }
3383         }
3384         if (bRowMember)
3385         {
3386             if (bRowRoot)
3387             {
3388                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3389                                            ddbox, bDynamicBox, bUniform, step);
3390                 cell_f_row = comm->root[d]->cell_f;
3391             }
3392             else
3393             {
3394                 cell_f_row = comm->cell_f_row;
3395             }
3396             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3397         }
3398     }
3399 }
3400
3401 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3402 {
3403     int d;
3404
3405     /* This function assumes the box is static and should therefore
3406      * not be called when the box has changed since the last
3407      * call to dd_partition_system.
3408      */
3409     for (d = 0; d < dd->ndim; d++)
3410     {
3411         relative_to_absolute_cell_bounds(dd, ddbox, d);
3412     }
3413 }
3414
3415
3416
3417 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3418                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3419                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3420                                   gmx_wallcycle_t wcycle)
3421 {
3422     gmx_domdec_comm_t *comm;
3423     int                dim;
3424
3425     comm = dd->comm;
3426
3427     if (bDoDLB)
3428     {
3429         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3430         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3431         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3432     }
3433     else if (bDynamicBox)
3434     {
3435         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3436     }
3437
3438     /* Set the dimensions for which no DD is used */
3439     for (dim = 0; dim < DIM; dim++)
3440     {
3441         if (dd->nc[dim] == 1)
3442         {
3443             comm->cell_x0[dim] = 0;
3444             comm->cell_x1[dim] = ddbox->box_size[dim];
3445             if (dim >= ddbox->nboundeddim)
3446             {
3447                 comm->cell_x0[dim] += ddbox->box0[dim];
3448                 comm->cell_x1[dim] += ddbox->box0[dim];
3449             }
3450         }
3451     }
3452 }
3453
3454 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3455 {
3456     int                    d, np, i;
3457     gmx_domdec_comm_dim_t *cd;
3458
3459     for (d = 0; d < dd->ndim; d++)
3460     {
3461         cd = &dd->comm->cd[d];
3462         np = npulse[dd->dim[d]];
3463         if (np > cd->np_nalloc)
3464         {
3465             if (debug)
3466             {
3467                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3468                         dim2char(dd->dim[d]), np);
3469             }
3470             if (DDMASTER(dd) && cd->np_nalloc > 0)
3471             {
3472                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3473             }
3474             srenew(cd->ind, np);
3475             for (i = cd->np_nalloc; i < np; i++)
3476             {
3477                 cd->ind[i].index  = NULL;
3478                 cd->ind[i].nalloc = 0;
3479             }
3480             cd->np_nalloc = np;
3481         }
3482         cd->np = np;
3483     }
3484 }
3485
3486
3487 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3488                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3489                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3490                               gmx_wallcycle_t wcycle)
3491 {
3492     gmx_domdec_comm_t *comm;
3493     int                d;
3494     ivec               npulse;
3495
3496     comm = dd->comm;
3497
3498     /* Copy the old cell boundaries for the cg displacement check */
3499     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3500     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3501
3502     if (dlbIsOn(comm))
3503     {
3504         if (DDMASTER(dd))
3505         {
3506             check_box_size(dd, ddbox);
3507         }
3508         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3509     }
3510     else
3511     {
3512         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3513         realloc_comm_ind(dd, npulse);
3514     }
3515
3516     if (debug)
3517     {
3518         for (d = 0; d < DIM; d++)
3519         {
3520             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3521                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3522         }
3523     }
3524 }
3525
3526 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3527                                   gmx_ddbox_t *ddbox,
3528                                   rvec cell_ns_x0, rvec cell_ns_x1,
3529                                   gmx_int64_t step)
3530 {
3531     gmx_domdec_comm_t *comm;
3532     int                dim_ind, dim;
3533
3534     comm = dd->comm;
3535
3536     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3537     {
3538         dim = dd->dim[dim_ind];
3539
3540         /* Without PBC we don't have restrictions on the outer cells */
3541         if (!(dim >= ddbox->npbcdim &&
3542               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3543             dlbIsOn(comm) &&
3544             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3545             comm->cellsize_min[dim])
3546         {
3547             char buf[22];
3548             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3549                       gmx_step_str(step, buf), dim2char(dim),
3550                       comm->cell_x1[dim] - comm->cell_x0[dim],
3551                       ddbox->skew_fac[dim],
3552                       dd->comm->cellsize_min[dim],
3553                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3554         }
3555     }
3556
3557     if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3558     {
3559         /* Communicate the boundaries and update cell_ns_x0/1 */
3560         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3561         if (dlbIsOn(dd->comm) && dd->ndim > 1)
3562         {
3563             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3564         }
3565     }
3566 }
3567
3568 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3569 {
3570     if (YY < npbcdim)
3571     {
3572         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3573     }
3574     else
3575     {
3576         tcm[YY][XX] = 0;
3577     }
3578     if (ZZ < npbcdim)
3579     {
3580         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3581         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3582     }
3583     else
3584     {
3585         tcm[ZZ][XX] = 0;
3586         tcm[ZZ][YY] = 0;
3587     }
3588 }
3589
3590 static void check_screw_box(matrix box)
3591 {
3592     /* Mathematical limitation */
3593     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3594     {
3595         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3596     }
3597
3598     /* Limitation due to the asymmetry of the eighth shell method */
3599     if (box[ZZ][YY] != 0)
3600     {
3601         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3602     }
3603 }
3604
3605 static void distribute_cg(FILE *fplog,
3606                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3607                           gmx_domdec_t *dd)
3608 {
3609     gmx_domdec_master_t *ma;
3610     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3611     int                  i, icg, j, k, k0, k1, d;
3612     matrix               tcm;
3613     rvec                 cg_cm;
3614     ivec                 ind;
3615     real                 nrcg, inv_ncg, pos_d;
3616     int                 *cgindex;
3617     gmx_bool             bScrew;
3618
3619     ma = dd->ma;
3620
3621     if (tmp_ind == NULL)
3622     {
3623         snew(tmp_nalloc, dd->nnodes);
3624         snew(tmp_ind, dd->nnodes);
3625         for (i = 0; i < dd->nnodes; i++)
3626         {
3627             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3628             snew(tmp_ind[i], tmp_nalloc[i]);
3629         }
3630     }
3631
3632     /* Clear the count */
3633     for (i = 0; i < dd->nnodes; i++)
3634     {
3635         ma->ncg[i] = 0;
3636         ma->nat[i] = 0;
3637     }
3638
3639     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3640
3641     cgindex = cgs->index;
3642
3643     /* Compute the center of geometry for all charge groups */
3644     for (icg = 0; icg < cgs->nr; icg++)
3645     {
3646         k0      = cgindex[icg];
3647         k1      = cgindex[icg+1];
3648         nrcg    = k1 - k0;
3649         if (nrcg == 1)
3650         {
3651             copy_rvec(pos[k0], cg_cm);
3652         }
3653         else
3654         {
3655             inv_ncg = 1.0/nrcg;
3656
3657             clear_rvec(cg_cm);
3658             for (k = k0; (k < k1); k++)
3659             {
3660                 rvec_inc(cg_cm, pos[k]);
3661             }
3662             for (d = 0; (d < DIM); d++)
3663             {
3664                 cg_cm[d] *= inv_ncg;
3665             }
3666         }
3667         /* Put the charge group in the box and determine the cell index */
3668         for (d = DIM-1; d >= 0; d--)
3669         {
3670             pos_d = cg_cm[d];
3671             if (d < dd->npbcdim)
3672             {
3673                 bScrew = (dd->bScrewPBC && d == XX);
3674                 if (tric_dir[d] && dd->nc[d] > 1)
3675                 {
3676                     /* Use triclinic coordintates for this dimension */
3677                     for (j = d+1; j < DIM; j++)
3678                     {
3679                         pos_d += cg_cm[j]*tcm[j][d];
3680                     }
3681                 }
3682                 while (pos_d >= box[d][d])
3683                 {
3684                     pos_d -= box[d][d];
3685                     rvec_dec(cg_cm, box[d]);
3686                     if (bScrew)
3687                     {
3688                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3689                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3690                     }
3691                     for (k = k0; (k < k1); k++)
3692                     {
3693                         rvec_dec(pos[k], box[d]);
3694                         if (bScrew)
3695                         {
3696                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3697                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3698                         }
3699                     }
3700                 }
3701                 while (pos_d < 0)
3702                 {
3703                     pos_d += box[d][d];
3704                     rvec_inc(cg_cm, box[d]);
3705                     if (bScrew)
3706                     {
3707                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3708                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3709                     }
3710                     for (k = k0; (k < k1); k++)
3711                     {
3712                         rvec_inc(pos[k], box[d]);
3713                         if (bScrew)
3714                         {
3715                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3716                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3717                         }
3718                     }
3719                 }
3720             }
3721             /* This could be done more efficiently */
3722             ind[d] = 0;
3723             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3724             {
3725                 ind[d]++;
3726             }
3727         }
3728         i = dd_index(dd->nc, ind);
3729         if (ma->ncg[i] == tmp_nalloc[i])
3730         {
3731             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3732             srenew(tmp_ind[i], tmp_nalloc[i]);
3733         }
3734         tmp_ind[i][ma->ncg[i]] = icg;
3735         ma->ncg[i]++;
3736         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3737     }
3738
3739     k1 = 0;
3740     for (i = 0; i < dd->nnodes; i++)
3741     {
3742         ma->index[i] = k1;
3743         for (k = 0; k < ma->ncg[i]; k++)
3744         {
3745             ma->cg[k1++] = tmp_ind[i][k];
3746         }
3747     }
3748     ma->index[dd->nnodes] = k1;
3749
3750     for (i = 0; i < dd->nnodes; i++)
3751     {
3752         sfree(tmp_ind[i]);
3753     }
3754     sfree(tmp_ind);
3755     sfree(tmp_nalloc);
3756
3757     if (fplog)
3758     {
3759         // Use double for the sums to avoid natoms^2 overflowing
3760         // (65537^2 > 2^32)
3761         int    nat_sum, nat_min, nat_max;
3762         double nat2_sum;
3763
3764         nat_sum  = 0;
3765         nat2_sum = 0;
3766         nat_min  = ma->nat[0];
3767         nat_max  = ma->nat[0];
3768         for (i = 0; i < dd->nnodes; i++)
3769         {
3770             nat_sum  += ma->nat[i];
3771             // cast to double to avoid integer overflows when squaring
3772             nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3773             nat_min   = std::min(nat_min, ma->nat[i]);
3774             nat_max   = std::max(nat_max, ma->nat[i]);
3775         }
3776         nat_sum  /= dd->nnodes;
3777         nat2_sum /= dd->nnodes;
3778
3779         fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3780                 dd->nnodes,
3781                 nat_sum,
3782                 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3783                 nat_min, nat_max);
3784     }
3785 }
3786
3787 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3788                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3789                                 rvec pos[])
3790 {
3791     gmx_domdec_master_t *ma = NULL;
3792     ivec                 npulse;
3793     int                  i, cg_gl;
3794     int                 *ibuf, buf2[2] = { 0, 0 };
3795     gmx_bool             bMaster = DDMASTER(dd);
3796
3797     if (bMaster)
3798     {
3799         ma = dd->ma;
3800
3801         if (dd->bScrewPBC)
3802         {
3803             check_screw_box(box);
3804         }
3805
3806         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3807
3808         distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3809         for (i = 0; i < dd->nnodes; i++)
3810         {
3811             ma->ibuf[2*i]   = ma->ncg[i];
3812             ma->ibuf[2*i+1] = ma->nat[i];
3813         }
3814         ibuf = ma->ibuf;
3815     }
3816     else
3817     {
3818         ibuf = NULL;
3819     }
3820     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3821
3822     dd->ncg_home = buf2[0];
3823     dd->nat_home = buf2[1];
3824     dd->ncg_tot  = dd->ncg_home;
3825     dd->nat_tot  = dd->nat_home;
3826     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3827     {
3828         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3829         srenew(dd->index_gl, dd->cg_nalloc);
3830         srenew(dd->cgindex, dd->cg_nalloc+1);
3831     }
3832     if (bMaster)
3833     {
3834         for (i = 0; i < dd->nnodes; i++)
3835         {
3836             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
3837             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3838         }
3839     }
3840
3841     dd_scatterv(dd,
3842                 bMaster ? ma->ibuf : NULL,
3843                 bMaster ? ma->ibuf+dd->nnodes : NULL,
3844                 bMaster ? ma->cg : NULL,
3845                 dd->ncg_home*sizeof(int), dd->index_gl);
3846
3847     /* Determine the home charge group sizes */
3848     dd->cgindex[0] = 0;
3849     for (i = 0; i < dd->ncg_home; i++)
3850     {
3851         cg_gl            = dd->index_gl[i];
3852         dd->cgindex[i+1] =
3853             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3854     }
3855
3856     if (debug)
3857     {
3858         fprintf(debug, "Home charge groups:\n");
3859         for (i = 0; i < dd->ncg_home; i++)
3860         {
3861             fprintf(debug, " %d", dd->index_gl[i]);
3862             if (i % 10 == 9)
3863             {
3864                 fprintf(debug, "\n");
3865             }
3866         }
3867         fprintf(debug, "\n");
3868     }
3869 }
3870
3871 static int compact_and_copy_vec_at(int ncg, int *move,
3872                                    int *cgindex,
3873                                    int nvec, int vec,
3874                                    rvec *src, gmx_domdec_comm_t *comm,
3875                                    gmx_bool bCompact)
3876 {
3877     int m, icg, i, i0, i1, nrcg;
3878     int home_pos;
3879     int pos_vec[DIM*2];
3880
3881     home_pos = 0;
3882
3883     for (m = 0; m < DIM*2; m++)
3884     {
3885         pos_vec[m] = 0;
3886     }
3887
3888     i0 = 0;
3889     for (icg = 0; icg < ncg; icg++)
3890     {
3891         i1 = cgindex[icg+1];
3892         m  = move[icg];
3893         if (m == -1)
3894         {
3895             if (bCompact)
3896             {
3897                 /* Compact the home array in place */
3898                 for (i = i0; i < i1; i++)
3899                 {
3900                     copy_rvec(src[i], src[home_pos++]);
3901                 }
3902             }
3903         }
3904         else
3905         {
3906             /* Copy to the communication buffer */
3907             nrcg        = i1 - i0;
3908             pos_vec[m] += 1 + vec*nrcg;
3909             for (i = i0; i < i1; i++)
3910             {
3911                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3912             }
3913             pos_vec[m] += (nvec - vec - 1)*nrcg;
3914         }
3915         if (!bCompact)
3916         {
3917             home_pos += i1 - i0;
3918         }
3919         i0 = i1;
3920     }
3921
3922     return home_pos;
3923 }
3924
3925 static int compact_and_copy_vec_cg(int ncg, int *move,
3926                                    int *cgindex,
3927                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
3928                                    gmx_bool bCompact)
3929 {
3930     int m, icg, i0, i1, nrcg;
3931     int home_pos;
3932     int pos_vec[DIM*2];
3933
3934     home_pos = 0;
3935
3936     for (m = 0; m < DIM*2; m++)
3937     {
3938         pos_vec[m] = 0;
3939     }
3940
3941     i0 = 0;
3942     for (icg = 0; icg < ncg; icg++)
3943     {
3944         i1 = cgindex[icg+1];
3945         m  = move[icg];
3946         if (m == -1)
3947         {
3948             if (bCompact)
3949             {
3950                 /* Compact the home array in place */
3951                 copy_rvec(src[icg], src[home_pos++]);
3952             }
3953         }
3954         else
3955         {
3956             nrcg = i1 - i0;
3957             /* Copy to the communication buffer */
3958             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3959             pos_vec[m] += 1 + nrcg*nvec;
3960         }
3961         i0 = i1;
3962     }
3963     if (!bCompact)
3964     {
3965         home_pos = ncg;
3966     }
3967
3968     return home_pos;
3969 }
3970
3971 static int compact_ind(int ncg, int *move,
3972                        int *index_gl, int *cgindex,
3973                        int *gatindex,
3974                        gmx_ga2la_t *ga2la, char *bLocalCG,
3975                        int *cginfo)
3976 {
3977     int cg, nat, a0, a1, a, a_gl;
3978     int home_pos;
3979
3980     home_pos = 0;
3981     nat      = 0;
3982     for (cg = 0; cg < ncg; cg++)
3983     {
3984         a0 = cgindex[cg];
3985         a1 = cgindex[cg+1];
3986         if (move[cg] == -1)
3987         {
3988             /* Compact the home arrays in place.
3989              * Anything that can be done here avoids access to global arrays.
3990              */
3991             cgindex[home_pos] = nat;
3992             for (a = a0; a < a1; a++)
3993             {
3994                 a_gl          = gatindex[a];
3995                 gatindex[nat] = a_gl;
3996                 /* The cell number stays 0, so we don't need to set it */
3997                 ga2la_change_la(ga2la, a_gl, nat);
3998                 nat++;
3999             }
4000             index_gl[home_pos] = index_gl[cg];
4001             cginfo[home_pos]   = cginfo[cg];
4002             /* The charge group remains local, so bLocalCG does not change */
4003             home_pos++;
4004         }
4005         else
4006         {
4007             /* Clear the global indices */
4008             for (a = a0; a < a1; a++)
4009             {
4010                 ga2la_del(ga2la, gatindex[a]);
4011             }
4012             if (bLocalCG)
4013             {
4014                 bLocalCG[index_gl[cg]] = FALSE;
4015             }
4016         }
4017     }
4018     cgindex[home_pos] = nat;
4019
4020     return home_pos;
4021 }
4022
4023 static void clear_and_mark_ind(int ncg, int *move,
4024                                int *index_gl, int *cgindex, int *gatindex,
4025                                gmx_ga2la_t *ga2la, char *bLocalCG,
4026                                int *cell_index)
4027 {
4028     int cg, a0, a1, a;
4029
4030     for (cg = 0; cg < ncg; cg++)
4031     {
4032         if (move[cg] >= 0)
4033         {
4034             a0 = cgindex[cg];
4035             a1 = cgindex[cg+1];
4036             /* Clear the global indices */
4037             for (a = a0; a < a1; a++)
4038             {
4039                 ga2la_del(ga2la, gatindex[a]);
4040             }
4041             if (bLocalCG)
4042             {
4043                 bLocalCG[index_gl[cg]] = FALSE;
4044             }
4045             /* Signal that this cg has moved using the ns cell index.
4046              * Here we set it to -1. fill_grid will change it
4047              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4048              */
4049             cell_index[cg] = -1;
4050         }
4051     }
4052 }
4053
4054 static void print_cg_move(FILE *fplog,
4055                           gmx_domdec_t *dd,
4056                           gmx_int64_t step, int cg, int dim, int dir,
4057                           gmx_bool bHaveCgcmOld, real limitd,
4058                           rvec cm_old, rvec cm_new, real pos_d)
4059 {
4060     gmx_domdec_comm_t *comm;
4061     char               buf[22];
4062
4063     comm = dd->comm;
4064
4065     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4066     if (limitd > 0)
4067     {
4068         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4069                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4070                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4071     }
4072     else
4073     {
4074         /* We don't have a limiting distance available: don't print it */
4075         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4076                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4077                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4078     }
4079     fprintf(fplog, "distance out of cell %f\n",
4080             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4081     if (bHaveCgcmOld)
4082     {
4083         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4084                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4085     }
4086     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4087             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4088     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4089             dim2char(dim),
4090             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4091     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4092             dim2char(dim),
4093             comm->cell_x0[dim], comm->cell_x1[dim]);
4094 }
4095
4096 static void cg_move_error(FILE *fplog,
4097                           gmx_domdec_t *dd,
4098                           gmx_int64_t step, int cg, int dim, int dir,
4099                           gmx_bool bHaveCgcmOld, real limitd,
4100                           rvec cm_old, rvec cm_new, real pos_d)
4101 {
4102     if (fplog)
4103     {
4104         print_cg_move(fplog, dd, step, cg, dim, dir,
4105                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4106     }
4107     print_cg_move(stderr, dd, step, cg, dim, dir,
4108                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4109     gmx_fatal(FARGS,
4110               "%s moved too far between two domain decomposition steps\n"
4111               "This usually means that your system is not well equilibrated",
4112               dd->comm->bCGs ? "A charge group" : "An atom");
4113 }
4114
4115 static void rotate_state_atom(t_state *state, int a)
4116 {
4117     int est;
4118
4119     for (est = 0; est < estNR; est++)
4120     {
4121         if (EST_DISTR(est) && (state->flags & (1<<est)))
4122         {
4123             switch (est)
4124             {
4125                 case estX:
4126                     /* Rotate the complete state; for a rectangular box only */
4127                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4128                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4129                     break;
4130                 case estV:
4131                     state->v[a][YY] = -state->v[a][YY];
4132                     state->v[a][ZZ] = -state->v[a][ZZ];
4133                     break;
4134                 case estSDX:
4135                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4136                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4137                     break;
4138                 case estCGP:
4139                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4140                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4141                     break;
4142                 case estDISRE_INITF:
4143                 case estDISRE_RM3TAV:
4144                 case estORIRE_INITF:
4145                 case estORIRE_DTAV:
4146                     /* These are distances, so not affected by rotation */
4147                     break;
4148                 default:
4149                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4150             }
4151         }
4152     }
4153 }
4154
4155 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4156 {
4157     if (natoms > comm->moved_nalloc)
4158     {
4159         /* Contents should be preserved here */
4160         comm->moved_nalloc = over_alloc_dd(natoms);
4161         srenew(comm->moved, comm->moved_nalloc);
4162     }
4163
4164     return comm->moved;
4165 }
4166
4167 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4168                          gmx_domdec_t *dd,
4169                          t_state *state,
4170                          ivec tric_dir, matrix tcm,
4171                          rvec cell_x0, rvec cell_x1,
4172                          rvec limitd, rvec limit0, rvec limit1,
4173                          const int *cgindex,
4174                          int cg_start, int cg_end,
4175                          rvec *cg_cm,
4176                          int *move)
4177 {
4178     int      npbcdim;
4179     int      cg, k, k0, k1, d, dim, d2;
4180     int      mc, nrcg;
4181     int      flag;
4182     gmx_bool bScrew;
4183     ivec     dev;
4184     real     inv_ncg, pos_d;
4185     rvec     cm_new;
4186
4187     npbcdim = dd->npbcdim;
4188
4189     for (cg = cg_start; cg < cg_end; cg++)
4190     {
4191         k0   = cgindex[cg];
4192         k1   = cgindex[cg+1];
4193         nrcg = k1 - k0;
4194         if (nrcg == 1)
4195         {
4196             copy_rvec(state->x[k0], cm_new);
4197         }
4198         else
4199         {
4200             inv_ncg = 1.0/nrcg;
4201
4202             clear_rvec(cm_new);
4203             for (k = k0; (k < k1); k++)
4204             {
4205                 rvec_inc(cm_new, state->x[k]);
4206             }
4207             for (d = 0; (d < DIM); d++)
4208             {
4209                 cm_new[d] = inv_ncg*cm_new[d];
4210             }
4211         }
4212
4213         clear_ivec(dev);
4214         /* Do pbc and check DD cell boundary crossings */
4215         for (d = DIM-1; d >= 0; d--)
4216         {
4217             if (dd->nc[d] > 1)
4218             {
4219                 bScrew = (dd->bScrewPBC && d == XX);
4220                 /* Determine the location of this cg in lattice coordinates */
4221                 pos_d = cm_new[d];
4222                 if (tric_dir[d])
4223                 {
4224                     for (d2 = d+1; d2 < DIM; d2++)
4225                     {
4226                         pos_d += cm_new[d2]*tcm[d2][d];
4227                     }
4228                 }
4229                 /* Put the charge group in the triclinic unit-cell */
4230                 if (pos_d >= cell_x1[d])
4231                 {
4232                     if (pos_d >= limit1[d])
4233                     {
4234                         cg_move_error(fplog, dd, step, cg, d, 1,
4235                                       cg_cm != state->x, limitd[d],
4236                                       cg_cm[cg], cm_new, pos_d);
4237                     }
4238                     dev[d] = 1;
4239                     if (dd->ci[d] == dd->nc[d] - 1)
4240                     {
4241                         rvec_dec(cm_new, state->box[d]);
4242                         if (bScrew)
4243                         {
4244                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4245                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4246                         }
4247                         for (k = k0; (k < k1); k++)
4248                         {
4249                             rvec_dec(state->x[k], state->box[d]);
4250                             if (bScrew)
4251                             {
4252                                 rotate_state_atom(state, k);
4253                             }
4254                         }
4255                     }
4256                 }
4257                 else if (pos_d < cell_x0[d])
4258                 {
4259                     if (pos_d < limit0[d])
4260                     {
4261                         cg_move_error(fplog, dd, step, cg, d, -1,
4262                                       cg_cm != state->x, limitd[d],
4263                                       cg_cm[cg], cm_new, pos_d);
4264                     }
4265                     dev[d] = -1;
4266                     if (dd->ci[d] == 0)
4267                     {
4268                         rvec_inc(cm_new, state->box[d]);
4269                         if (bScrew)
4270                         {
4271                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4272                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4273                         }
4274                         for (k = k0; (k < k1); k++)
4275                         {
4276                             rvec_inc(state->x[k], state->box[d]);
4277                             if (bScrew)
4278                             {
4279                                 rotate_state_atom(state, k);
4280                             }
4281                         }
4282                     }
4283                 }
4284             }
4285             else if (d < npbcdim)
4286             {
4287                 /* Put the charge group in the rectangular unit-cell */
4288                 while (cm_new[d] >= state->box[d][d])
4289                 {
4290                     rvec_dec(cm_new, state->box[d]);
4291                     for (k = k0; (k < k1); k++)
4292                     {
4293                         rvec_dec(state->x[k], state->box[d]);
4294                     }
4295                 }
4296                 while (cm_new[d] < 0)
4297                 {
4298                     rvec_inc(cm_new, state->box[d]);
4299                     for (k = k0; (k < k1); k++)
4300                     {
4301                         rvec_inc(state->x[k], state->box[d]);
4302                     }
4303                 }
4304             }
4305         }
4306
4307         copy_rvec(cm_new, cg_cm[cg]);
4308
4309         /* Determine where this cg should go */
4310         flag = 0;
4311         mc   = -1;
4312         for (d = 0; d < dd->ndim; d++)
4313         {
4314             dim = dd->dim[d];
4315             if (dev[dim] == 1)
4316             {
4317                 flag |= DD_FLAG_FW(d);
4318                 if (mc == -1)
4319                 {
4320                     mc = d*2;
4321                 }
4322             }
4323             else if (dev[dim] == -1)
4324             {
4325                 flag |= DD_FLAG_BW(d);
4326                 if (mc == -1)
4327                 {
4328                     if (dd->nc[dim] > 2)
4329                     {
4330                         mc = d*2 + 1;
4331                     }
4332                     else
4333                     {
4334                         mc = d*2;
4335                     }
4336                 }
4337             }
4338         }
4339         /* Temporarily store the flag in move */
4340         move[cg] = mc + flag;
4341     }
4342 }
4343
4344 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4345                                gmx_domdec_t *dd, ivec tric_dir,
4346                                t_state *state, rvec **f,
4347                                t_forcerec *fr,
4348                                gmx_bool bCompact,
4349                                t_nrnb *nrnb,
4350                                int *ncg_stay_home,
4351                                int *ncg_moved)
4352 {
4353     int               *move;
4354     int                npbcdim;
4355     int                ncg[DIM*2], nat[DIM*2];
4356     int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
4357     int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4358     int                sbuf[2], rbuf[2];
4359     int                home_pos_cg, home_pos_at, buf_pos;
4360     int                flag;
4361     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4362     real               pos_d;
4363     matrix             tcm;
4364     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4365     int               *cgindex;
4366     cginfo_mb_t       *cginfo_mb;
4367     gmx_domdec_comm_t *comm;
4368     int               *moved;
4369     int                nthread, thread;
4370
4371     if (dd->bScrewPBC)
4372     {
4373         check_screw_box(state->box);
4374     }
4375
4376     comm  = dd->comm;
4377     if (fr->cutoff_scheme == ecutsGROUP)
4378     {
4379         cg_cm = fr->cg_cm;
4380     }
4381
4382     for (i = 0; i < estNR; i++)
4383     {
4384         if (EST_DISTR(i))
4385         {
4386             switch (i)
4387             {
4388                 case estX: /* Always present */ break;
4389                 case estV:   bV   = (state->flags & (1<<i)); break;
4390                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4391                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4392                 case estLD_RNG:
4393                 case estLD_RNGI:
4394                 case estDISRE_INITF:
4395                 case estDISRE_RM3TAV:
4396                 case estORIRE_INITF:
4397                 case estORIRE_DTAV:
4398                     /* No processing required */
4399                     break;
4400                 default:
4401                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4402             }
4403         }
4404     }
4405
4406     if (dd->ncg_tot > comm->nalloc_int)
4407     {
4408         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4409         srenew(comm->buf_int, comm->nalloc_int);
4410     }
4411     move = comm->buf_int;
4412
4413     /* Clear the count */
4414     for (c = 0; c < dd->ndim*2; c++)
4415     {
4416         ncg[c] = 0;
4417         nat[c] = 0;
4418     }
4419
4420     npbcdim = dd->npbcdim;
4421
4422     for (d = 0; (d < DIM); d++)
4423     {
4424         limitd[d] = dd->comm->cellsize_min[d];
4425         if (d >= npbcdim && dd->ci[d] == 0)
4426         {
4427             cell_x0[d] = -GMX_FLOAT_MAX;
4428         }
4429         else
4430         {
4431             cell_x0[d] = comm->cell_x0[d];
4432         }
4433         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4434         {
4435             cell_x1[d] = GMX_FLOAT_MAX;
4436         }
4437         else
4438         {
4439             cell_x1[d] = comm->cell_x1[d];
4440         }
4441         if (d < npbcdim)
4442         {
4443             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4444             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4445         }
4446         else
4447         {
4448             /* We check after communication if a charge group moved
4449              * more than one cell. Set the pre-comm check limit to float_max.
4450              */
4451             limit0[d] = -GMX_FLOAT_MAX;
4452             limit1[d] =  GMX_FLOAT_MAX;
4453         }
4454     }
4455
4456     make_tric_corr_matrix(npbcdim, state->box, tcm);
4457
4458     cgindex = dd->cgindex;
4459
4460     nthread = gmx_omp_nthreads_get(emntDomdec);
4461
4462     /* Compute the center of geometry for all home charge groups
4463      * and put them in the box and determine where they should go.
4464      */
4465 #pragma omp parallel for num_threads(nthread) schedule(static)
4466     for (thread = 0; thread < nthread; thread++)
4467     {
4468         try
4469         {
4470             calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4471                          cell_x0, cell_x1, limitd, limit0, limit1,
4472                          cgindex,
4473                          ( thread   *dd->ncg_home)/nthread,
4474                          ((thread+1)*dd->ncg_home)/nthread,
4475                          fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4476                          move);
4477         }
4478         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4479     }
4480
4481     for (cg = 0; cg < dd->ncg_home; cg++)
4482     {
4483         if (move[cg] >= 0)
4484         {
4485             mc       = move[cg];
4486             flag     = mc & ~DD_FLAG_NRCG;
4487             mc       = mc & DD_FLAG_NRCG;
4488             move[cg] = mc;
4489
4490             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4491             {
4492                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4493                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4494             }
4495             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4496             /* We store the cg size in the lower 16 bits
4497              * and the place where the charge group should go
4498              * in the next 6 bits. This saves some communication volume.
4499              */
4500             nrcg = cgindex[cg+1] - cgindex[cg];
4501             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4502             ncg[mc] += 1;
4503             nat[mc] += nrcg;
4504         }
4505     }
4506
4507     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4508     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4509
4510     *ncg_moved = 0;
4511     for (i = 0; i < dd->ndim*2; i++)
4512     {
4513         *ncg_moved += ncg[i];
4514     }
4515
4516     nvec = 1;
4517     if (bV)
4518     {
4519         nvec++;
4520     }
4521     if (bSDX)
4522     {
4523         nvec++;
4524     }
4525     if (bCGP)
4526     {
4527         nvec++;
4528     }
4529
4530     /* Make sure the communication buffers are large enough */
4531     for (mc = 0; mc < dd->ndim*2; mc++)
4532     {
4533         nvr = ncg[mc] + nat[mc]*nvec;
4534         if (nvr > comm->cgcm_state_nalloc[mc])
4535         {
4536             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4537             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4538         }
4539     }
4540
4541     switch (fr->cutoff_scheme)
4542     {
4543         case ecutsGROUP:
4544             /* Recalculating cg_cm might be cheaper than communicating,
4545              * but that could give rise to rounding issues.
4546              */
4547             home_pos_cg =
4548                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4549                                         nvec, cg_cm, comm, bCompact);
4550             break;
4551         case ecutsVERLET:
4552             /* Without charge groups we send the moved atom coordinates
4553              * over twice. This is so the code below can be used without
4554              * many conditionals for both for with and without charge groups.
4555              */
4556             home_pos_cg =
4557                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4558                                         nvec, state->x, comm, FALSE);
4559             if (bCompact)
4560             {
4561                 home_pos_cg -= *ncg_moved;
4562             }
4563             break;
4564         default:
4565             gmx_incons("unimplemented");
4566             home_pos_cg = 0;
4567     }
4568
4569     vec         = 0;
4570     home_pos_at =
4571         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4572                                 nvec, vec++, state->x, comm, bCompact);
4573     if (bV)
4574     {
4575         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4576                                 nvec, vec++, state->v, comm, bCompact);
4577     }
4578     if (bSDX)
4579     {
4580         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4581                                 nvec, vec++, state->sd_X, comm, bCompact);
4582     }
4583     if (bCGP)
4584     {
4585         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4586                                 nvec, vec++, state->cg_p, comm, bCompact);
4587     }
4588
4589     if (bCompact)
4590     {
4591         compact_ind(dd->ncg_home, move,
4592                     dd->index_gl, dd->cgindex, dd->gatindex,
4593                     dd->ga2la, comm->bLocalCG,
4594                     fr->cginfo);
4595     }
4596     else
4597     {
4598         if (fr->cutoff_scheme == ecutsVERLET)
4599         {
4600             moved = get_moved(comm, dd->ncg_home);
4601
4602             for (k = 0; k < dd->ncg_home; k++)
4603             {
4604                 moved[k] = 0;
4605             }
4606         }
4607         else
4608         {
4609             moved = fr->ns->grid->cell_index;
4610         }
4611
4612         clear_and_mark_ind(dd->ncg_home, move,
4613                            dd->index_gl, dd->cgindex, dd->gatindex,
4614                            dd->ga2la, comm->bLocalCG,
4615                            moved);
4616     }
4617
4618     cginfo_mb = fr->cginfo_mb;
4619
4620     *ncg_stay_home = home_pos_cg;
4621     for (d = 0; d < dd->ndim; d++)
4622     {
4623         dim      = dd->dim[d];
4624         ncg_recv = 0;
4625         nvr      = 0;
4626         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4627         {
4628             cdd = d*2 + dir;
4629             /* Communicate the cg and atom counts */
4630             sbuf[0] = ncg[cdd];
4631             sbuf[1] = nat[cdd];
4632             if (debug)
4633             {
4634                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4635                         d, dir, sbuf[0], sbuf[1]);
4636             }
4637             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4638
4639             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4640             {
4641                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4642                 srenew(comm->buf_int, comm->nalloc_int);
4643             }
4644
4645             /* Communicate the charge group indices, sizes and flags */
4646             dd_sendrecv_int(dd, d, dir,
4647                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4648                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4649
4650             nvs = ncg[cdd] + nat[cdd]*nvec;
4651             i   = rbuf[0]  + rbuf[1] *nvec;
4652             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4653
4654             /* Communicate cgcm and state */
4655             dd_sendrecv_rvec(dd, d, dir,
4656                              comm->cgcm_state[cdd], nvs,
4657                              comm->vbuf.v+nvr, i);
4658             ncg_recv += rbuf[0];
4659             nvr      += i;
4660         }
4661
4662         /* Process the received charge groups */
4663         buf_pos = 0;
4664         for (cg = 0; cg < ncg_recv; cg++)
4665         {
4666             flag = comm->buf_int[cg*DD_CGIBS+1];
4667
4668             if (dim >= npbcdim && dd->nc[dim] > 2)
4669             {
4670                 /* No pbc in this dim and more than one domain boundary.
4671                  * We do a separate check if a charge group didn't move too far.
4672                  */
4673                 if (((flag & DD_FLAG_FW(d)) &&
4674                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4675                     ((flag & DD_FLAG_BW(d)) &&
4676                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4677                 {
4678                     cg_move_error(fplog, dd, step, cg, dim,
4679                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4680                                   fr->cutoff_scheme == ecutsGROUP, 0,
4681                                   comm->vbuf.v[buf_pos],
4682                                   comm->vbuf.v[buf_pos],
4683                                   comm->vbuf.v[buf_pos][dim]);
4684                 }
4685             }
4686
4687             mc = -1;
4688             if (d < dd->ndim-1)
4689             {
4690                 /* Check which direction this cg should go */
4691                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4692                 {
4693                     if (dlbIsOn(dd->comm))
4694                     {
4695                         /* The cell boundaries for dimension d2 are not equal
4696                          * for each cell row of the lower dimension(s),
4697                          * therefore we might need to redetermine where
4698                          * this cg should go.
4699                          */
4700                         dim2 = dd->dim[d2];
4701                         /* If this cg crosses the box boundary in dimension d2
4702                          * we can use the communicated flag, so we do not
4703                          * have to worry about pbc.
4704                          */
4705                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4706                                (flag & DD_FLAG_FW(d2))) ||
4707                               (dd->ci[dim2] == 0 &&
4708                                (flag & DD_FLAG_BW(d2)))))
4709                         {
4710                             /* Clear the two flags for this dimension */
4711                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4712                             /* Determine the location of this cg
4713                              * in lattice coordinates
4714                              */
4715                             pos_d = comm->vbuf.v[buf_pos][dim2];
4716                             if (tric_dir[dim2])
4717                             {
4718                                 for (d3 = dim2+1; d3 < DIM; d3++)
4719                                 {
4720                                     pos_d +=
4721                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4722                                 }
4723                             }
4724                             /* Check of we are not at the box edge.
4725                              * pbc is only handled in the first step above,
4726                              * but this check could move over pbc while
4727                              * the first step did not due to different rounding.
4728                              */
4729                             if (pos_d >= cell_x1[dim2] &&
4730                                 dd->ci[dim2] != dd->nc[dim2]-1)
4731                             {
4732                                 flag |= DD_FLAG_FW(d2);
4733                             }
4734                             else if (pos_d < cell_x0[dim2] &&
4735                                      dd->ci[dim2] != 0)
4736                             {
4737                                 flag |= DD_FLAG_BW(d2);
4738                             }
4739                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4740                         }
4741                     }
4742                     /* Set to which neighboring cell this cg should go */
4743                     if (flag & DD_FLAG_FW(d2))
4744                     {
4745                         mc = d2*2;
4746                     }
4747                     else if (flag & DD_FLAG_BW(d2))
4748                     {
4749                         if (dd->nc[dd->dim[d2]] > 2)
4750                         {
4751                             mc = d2*2+1;
4752                         }
4753                         else
4754                         {
4755                             mc = d2*2;
4756                         }
4757                     }
4758                 }
4759             }
4760
4761             nrcg = flag & DD_FLAG_NRCG;
4762             if (mc == -1)
4763             {
4764                 if (home_pos_cg+1 > dd->cg_nalloc)
4765                 {
4766                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4767                     srenew(dd->index_gl, dd->cg_nalloc);
4768                     srenew(dd->cgindex, dd->cg_nalloc+1);
4769                 }
4770                 /* Set the global charge group index and size */
4771                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4772                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4773                 /* Copy the state from the buffer */
4774                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4775                 if (fr->cutoff_scheme == ecutsGROUP)
4776                 {
4777                     cg_cm = fr->cg_cm;
4778                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4779                 }
4780                 buf_pos++;
4781
4782                 /* Set the cginfo */
4783                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4784                                                    dd->index_gl[home_pos_cg]);
4785                 if (comm->bLocalCG)
4786                 {
4787                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4788                 }
4789
4790                 if (home_pos_at+nrcg > state->nalloc)
4791                 {
4792                     dd_realloc_state(state, f, home_pos_at+nrcg);
4793                 }
4794                 for (i = 0; i < nrcg; i++)
4795                 {
4796                     copy_rvec(comm->vbuf.v[buf_pos++],
4797                               state->x[home_pos_at+i]);
4798                 }
4799                 if (bV)
4800                 {
4801                     for (i = 0; i < nrcg; i++)
4802                     {
4803                         copy_rvec(comm->vbuf.v[buf_pos++],
4804                                   state->v[home_pos_at+i]);
4805                     }
4806                 }
4807                 if (bSDX)
4808                 {
4809                     for (i = 0; i < nrcg; i++)
4810                     {
4811                         copy_rvec(comm->vbuf.v[buf_pos++],
4812                                   state->sd_X[home_pos_at+i]);
4813                     }
4814                 }
4815                 if (bCGP)
4816                 {
4817                     for (i = 0; i < nrcg; i++)
4818                     {
4819                         copy_rvec(comm->vbuf.v[buf_pos++],
4820                                   state->cg_p[home_pos_at+i]);
4821                     }
4822                 }
4823                 home_pos_cg += 1;
4824                 home_pos_at += nrcg;
4825             }
4826             else
4827             {
4828                 /* Reallocate the buffers if necessary  */
4829                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4830                 {
4831                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4832                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4833                 }
4834                 nvr = ncg[mc] + nat[mc]*nvec;
4835                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4836                 {
4837                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4838                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4839                 }
4840                 /* Copy from the receive to the send buffers */
4841                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4842                        comm->buf_int + cg*DD_CGIBS,
4843                        DD_CGIBS*sizeof(int));
4844                 memcpy(comm->cgcm_state[mc][nvr],
4845                        comm->vbuf.v[buf_pos],
4846                        (1+nrcg*nvec)*sizeof(rvec));
4847                 buf_pos += 1 + nrcg*nvec;
4848                 ncg[mc] += 1;
4849                 nat[mc] += nrcg;
4850             }
4851         }
4852     }
4853
4854     /* With sorting (!bCompact) the indices are now only partially up to date
4855      * and ncg_home and nat_home are not the real count, since there are
4856      * "holes" in the arrays for the charge groups that moved to neighbors.
4857      */
4858     if (fr->cutoff_scheme == ecutsVERLET)
4859     {
4860         moved = get_moved(comm, home_pos_cg);
4861
4862         for (i = dd->ncg_home; i < home_pos_cg; i++)
4863         {
4864             moved[i] = 0;
4865         }
4866     }
4867     dd->ncg_home = home_pos_cg;
4868     dd->nat_home = home_pos_at;
4869
4870     if (debug)
4871     {
4872         fprintf(debug,
4873                 "Finished repartitioning: cgs moved out %d, new home %d\n",
4874                 *ncg_moved, dd->ncg_home-*ncg_moved);
4875
4876     }
4877 }
4878
4879 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4880 {
4881     /* Note that the cycles value can be incorrect, either 0 or some
4882      * extremely large value, when our thread migrated to another core
4883      * with an unsynchronized cycle counter. If this happens less often
4884      * that once per nstlist steps, this will not cause issues, since
4885      * we later subtract the maximum value from the sum over nstlist steps.
4886      * A zero count will slightly lower the total, but that's a small effect.
4887      * Note that the main purpose of the subtraction of the maximum value
4888      * is to avoid throwing off the load balancing when stalls occur due
4889      * e.g. system activity or network congestion.
4890      */
4891     dd->comm->cycl[ddCycl] += cycles;
4892     dd->comm->cycl_n[ddCycl]++;
4893     if (cycles > dd->comm->cycl_max[ddCycl])
4894     {
4895         dd->comm->cycl_max[ddCycl] = cycles;
4896     }
4897 }
4898
4899 static double force_flop_count(t_nrnb *nrnb)
4900 {
4901     int         i;
4902     double      sum;
4903     const char *name;
4904
4905     sum = 0;
4906     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4907     {
4908         /* To get closer to the real timings, we half the count
4909          * for the normal loops and again half it for water loops.
4910          */
4911         name = nrnb_str(i);
4912         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4913         {
4914             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4915         }
4916         else
4917         {
4918             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4919         }
4920     }
4921     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4922     {
4923         name = nrnb_str(i);
4924         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4925         {
4926             sum += nrnb->n[i]*cost_nrnb(i);
4927         }
4928     }
4929     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4930     {
4931         sum += nrnb->n[i]*cost_nrnb(i);
4932     }
4933
4934     return sum;
4935 }
4936
4937 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4938 {
4939     if (dd->comm->eFlop)
4940     {
4941         dd->comm->flop -= force_flop_count(nrnb);
4942     }
4943 }
4944 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4945 {
4946     if (dd->comm->eFlop)
4947     {
4948         dd->comm->flop += force_flop_count(nrnb);
4949         dd->comm->flop_n++;
4950     }
4951 }
4952
4953 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4954 {
4955     int i;
4956
4957     for (i = 0; i < ddCyclNr; i++)
4958     {
4959         dd->comm->cycl[i]     = 0;
4960         dd->comm->cycl_n[i]   = 0;
4961         dd->comm->cycl_max[i] = 0;
4962     }
4963     dd->comm->flop   = 0;
4964     dd->comm->flop_n = 0;
4965 }
4966
4967 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4968 {
4969     gmx_domdec_comm_t *comm;
4970     domdec_load_t     *load;
4971     domdec_root_t     *root = NULL;
4972     int                d, dim, i, pos;
4973     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
4974     gmx_bool           bSepPME;
4975
4976     if (debug)
4977     {
4978         fprintf(debug, "get_load_distribution start\n");
4979     }
4980
4981     wallcycle_start(wcycle, ewcDDCOMMLOAD);
4982
4983     comm = dd->comm;
4984
4985     bSepPME = (dd->pme_nodeid >= 0);
4986
4987     if (dd->ndim == 0 && bSepPME)
4988     {
4989         /* Without decomposition, but with PME nodes, we need the load */
4990         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4991         comm->load[0].pme = comm->cycl[ddCyclPME];
4992     }
4993
4994     for (d = dd->ndim-1; d >= 0; d--)
4995     {
4996         dim = dd->dim[d];
4997         /* Check if we participate in the communication in this dimension */
4998         if (d == dd->ndim-1 ||
4999             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5000         {
5001             load = &comm->load[d];
5002             if (dlbIsOn(dd->comm))
5003             {
5004                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5005             }
5006             pos = 0;
5007             if (d == dd->ndim-1)
5008             {
5009                 sbuf[pos++] = dd_force_load(comm);
5010                 sbuf[pos++] = sbuf[0];
5011                 if (dlbIsOn(dd->comm))
5012                 {
5013                     sbuf[pos++] = sbuf[0];
5014                     sbuf[pos++] = cell_frac;
5015                     if (d > 0)
5016                     {
5017                         sbuf[pos++] = comm->cell_f_max0[d];
5018                         sbuf[pos++] = comm->cell_f_min1[d];
5019                     }
5020                 }
5021                 if (bSepPME)
5022                 {
5023                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5024                     sbuf[pos++] = comm->cycl[ddCyclPME];
5025                 }
5026             }
5027             else
5028             {
5029                 sbuf[pos++] = comm->load[d+1].sum;
5030                 sbuf[pos++] = comm->load[d+1].max;
5031                 if (dlbIsOn(dd->comm))
5032                 {
5033                     sbuf[pos++] = comm->load[d+1].sum_m;
5034                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5035                     sbuf[pos++] = comm->load[d+1].flags;
5036                     if (d > 0)
5037                     {
5038                         sbuf[pos++] = comm->cell_f_max0[d];
5039                         sbuf[pos++] = comm->cell_f_min1[d];
5040                     }
5041                 }
5042                 if (bSepPME)
5043                 {
5044                     sbuf[pos++] = comm->load[d+1].mdf;
5045                     sbuf[pos++] = comm->load[d+1].pme;
5046                 }
5047             }
5048             load->nload = pos;
5049             /* Communicate a row in DD direction d.
5050              * The communicators are setup such that the root always has rank 0.
5051              */
5052 #ifdef GMX_MPI
5053             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5054                        load->load, load->nload*sizeof(float), MPI_BYTE,
5055                        0, comm->mpi_comm_load[d]);
5056 #endif
5057             if (dd->ci[dim] == dd->master_ci[dim])
5058             {
5059                 /* We are the root, process this row */
5060                 if (dlbIsOn(comm))
5061                 {
5062                     root = comm->root[d];
5063                 }
5064                 load->sum      = 0;
5065                 load->max      = 0;
5066                 load->sum_m    = 0;
5067                 load->cvol_min = 1;
5068                 load->flags    = 0;
5069                 load->mdf      = 0;
5070                 load->pme      = 0;
5071                 pos            = 0;
5072                 for (i = 0; i < dd->nc[dim]; i++)
5073                 {
5074                     load->sum += load->load[pos++];
5075                     load->max  = std::max(load->max, load->load[pos]);
5076                     pos++;
5077                     if (dlbIsOn(dd->comm))
5078                     {
5079                         if (root->bLimited)
5080                         {
5081                             /* This direction could not be load balanced properly,
5082                              * therefore we need to use the maximum iso the average load.
5083                              */
5084                             load->sum_m = std::max(load->sum_m, load->load[pos]);
5085                         }
5086                         else
5087                         {
5088                             load->sum_m += load->load[pos];
5089                         }
5090                         pos++;
5091                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5092                         pos++;
5093                         if (d < dd->ndim-1)
5094                         {
5095                             load->flags = (int)(load->load[pos++] + 0.5);
5096                         }
5097                         if (d > 0)
5098                         {
5099                             root->cell_f_max0[i] = load->load[pos++];
5100                             root->cell_f_min1[i] = load->load[pos++];
5101                         }
5102                     }
5103                     if (bSepPME)
5104                     {
5105                         load->mdf = std::max(load->mdf, load->load[pos]);
5106                         pos++;
5107                         load->pme = std::max(load->pme, load->load[pos]);
5108                         pos++;
5109                     }
5110                 }
5111                 if (dlbIsOn(comm) && root->bLimited)
5112                 {
5113                     load->sum_m *= dd->nc[dim];
5114                     load->flags |= (1<<d);
5115                 }
5116             }
5117         }
5118     }
5119
5120     if (DDMASTER(dd))
5121     {
5122         comm->nload      += dd_load_count(comm);
5123         comm->load_step  += comm->cycl[ddCyclStep];
5124         comm->load_sum   += comm->load[0].sum;
5125         comm->load_max   += comm->load[0].max;
5126         if (dlbIsOn(comm))
5127         {
5128             for (d = 0; d < dd->ndim; d++)
5129             {
5130                 if (comm->load[0].flags & (1<<d))
5131                 {
5132                     comm->load_lim[d]++;
5133                 }
5134             }
5135         }
5136         if (bSepPME)
5137         {
5138             comm->load_mdf += comm->load[0].mdf;
5139             comm->load_pme += comm->load[0].pme;
5140         }
5141     }
5142
5143     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5144
5145     if (debug)
5146     {
5147         fprintf(debug, "get_load_distribution finished\n");
5148     }
5149 }
5150
5151 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5152 {
5153     /* Return the relative performance loss on the total run time
5154      * due to the force calculation load imbalance.
5155      */
5156     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5157     {
5158         return
5159             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5160             (dd->comm->load_step*dd->nnodes);
5161     }
5162     else
5163     {
5164         return 0;
5165     }
5166 }
5167
5168 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5169 {
5170     char               buf[STRLEN];
5171     int                npp, npme, nnodes, d, limp;
5172     float              imbal, pme_f_ratio, lossf = 0, lossp = 0;
5173     gmx_bool           bLim;
5174     gmx_domdec_comm_t *comm;
5175
5176     comm = dd->comm;
5177     if (DDMASTER(dd) && comm->nload > 0)
5178     {
5179         npp    = dd->nnodes;
5180         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5181         nnodes = npp + npme;
5182         if (dd->nnodes > 1 && comm->load_sum > 0)
5183         {
5184             imbal  = comm->load_max*npp/comm->load_sum - 1;
5185             lossf  = dd_force_imb_perf_loss(dd);
5186             sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5187             fprintf(fplog, "%s", buf);
5188             fprintf(stderr, "\n");
5189             fprintf(stderr, "%s", buf);
5190             sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5191             fprintf(fplog, "%s", buf);
5192             fprintf(stderr, "%s", buf);
5193         }
5194         bLim = FALSE;
5195         if (dlbIsOn(comm))
5196         {
5197             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5198             for (d = 0; d < dd->ndim; d++)
5199             {
5200                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5201                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5202                 if (limp >= 50)
5203                 {
5204                     bLim = TRUE;
5205                 }
5206             }
5207             sprintf(buf+strlen(buf), "\n");
5208             fprintf(fplog, "%s", buf);
5209             fprintf(stderr, "%s", buf);
5210         }
5211         if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5212         {
5213             pme_f_ratio = comm->load_pme/comm->load_mdf;
5214             lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
5215             if (lossp <= 0)
5216             {
5217                 lossp *= (float)npme/(float)nnodes;
5218             }
5219             else
5220             {
5221                 lossp *= (float)npp/(float)nnodes;
5222             }
5223             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5224             fprintf(fplog, "%s", buf);
5225             fprintf(stderr, "%s", buf);
5226             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5227             fprintf(fplog, "%s", buf);
5228             fprintf(stderr, "%s", buf);
5229         }
5230         fprintf(fplog, "\n");
5231         fprintf(stderr, "\n");
5232
5233         if (lossf >= DD_PERF_LOSS_WARN)
5234         {
5235             sprintf(buf,
5236                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5237                     "      in the domain decomposition.\n", lossf*100);
5238             if (!dlbIsOn(comm))
5239             {
5240                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5241             }
5242             else if (bLim)
5243             {
5244                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5245             }
5246             fprintf(fplog, "%s\n", buf);
5247             fprintf(stderr, "%s\n", buf);
5248         }
5249         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5250         {
5251             sprintf(buf,
5252                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5253                     "      had %s work to do than the PP ranks.\n"
5254                     "      You might want to %s the number of PME ranks\n"
5255                     "      or %s the cut-off and the grid spacing.\n",
5256                     fabs(lossp*100),
5257                     (lossp < 0) ? "less"     : "more",
5258                     (lossp < 0) ? "decrease" : "increase",
5259                     (lossp < 0) ? "decrease" : "increase");
5260             fprintf(fplog, "%s\n", buf);
5261             fprintf(stderr, "%s\n", buf);
5262         }
5263     }
5264 }
5265
5266 static float dd_vol_min(gmx_domdec_t *dd)
5267 {
5268     return dd->comm->load[0].cvol_min*dd->nnodes;
5269 }
5270
5271 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5272 {
5273     return dd->comm->load[0].flags;
5274 }
5275
5276 static float dd_f_imbal(gmx_domdec_t *dd)
5277 {
5278     if (dd->comm->load[0].sum > 0)
5279     {
5280         return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5281     }
5282     else
5283     {
5284         /* Something is wrong in the cycle counting, report no load imbalance */
5285         return 0.0f;
5286     }
5287 }
5288
5289 float dd_pme_f_ratio(gmx_domdec_t *dd)
5290 {
5291     /* Should only be called on the DD master rank */
5292     assert(DDMASTER(dd));
5293
5294     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5295     {
5296         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5297     }
5298     else
5299     {
5300         return -1.0;
5301     }
5302 }
5303
5304 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5305 {
5306     int  flags, d;
5307     char buf[22];
5308
5309     flags = dd_load_flags(dd);
5310     if (flags)
5311     {
5312         fprintf(fplog,
5313                 "DD  load balancing is limited by minimum cell size in dimension");
5314         for (d = 0; d < dd->ndim; d++)
5315         {
5316             if (flags & (1<<d))
5317             {
5318                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5319             }
5320         }
5321         fprintf(fplog, "\n");
5322     }
5323     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5324     if (dlbIsOn(dd->comm))
5325     {
5326         fprintf(fplog, "  vol min/aver %5.3f%c",
5327                 dd_vol_min(dd), flags ? '!' : ' ');
5328     }
5329     if (dd->nnodes > 1)
5330     {
5331         fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5332     }
5333     if (dd->comm->cycl_n[ddCyclPME])
5334     {
5335         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5336     }
5337     fprintf(fplog, "\n\n");
5338 }
5339
5340 static void dd_print_load_verbose(gmx_domdec_t *dd)
5341 {
5342     if (dlbIsOn(dd->comm))
5343     {
5344         fprintf(stderr, "vol %4.2f%c ",
5345                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5346     }
5347     if (dd->nnodes > 1)
5348     {
5349         fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5350     }
5351     if (dd->comm->cycl_n[ddCyclPME])
5352     {
5353         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5354     }
5355 }
5356
5357 #ifdef GMX_MPI
5358 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5359 {
5360     MPI_Comm           c_row;
5361     int                dim, i, rank;
5362     ivec               loc_c;
5363     domdec_root_t     *root;
5364     gmx_bool           bPartOfGroup = FALSE;
5365
5366     dim = dd->dim[dim_ind];
5367     copy_ivec(loc, loc_c);
5368     for (i = 0; i < dd->nc[dim]; i++)
5369     {
5370         loc_c[dim] = i;
5371         rank       = dd_index(dd->nc, loc_c);
5372         if (rank == dd->rank)
5373         {
5374             /* This process is part of the group */
5375             bPartOfGroup = TRUE;
5376         }
5377     }
5378     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5379                    &c_row);
5380     if (bPartOfGroup)
5381     {
5382         dd->comm->mpi_comm_load[dim_ind] = c_row;
5383         if (dd->comm->dlbState != edlbsOffForever)
5384         {
5385             if (dd->ci[dim] == dd->master_ci[dim])
5386             {
5387                 /* This is the root process of this row */
5388                 snew(dd->comm->root[dim_ind], 1);
5389                 root = dd->comm->root[dim_ind];
5390                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5391                 snew(root->old_cell_f, dd->nc[dim]+1);
5392                 snew(root->bCellMin, dd->nc[dim]);
5393                 if (dim_ind > 0)
5394                 {
5395                     snew(root->cell_f_max0, dd->nc[dim]);
5396                     snew(root->cell_f_min1, dd->nc[dim]);
5397                     snew(root->bound_min, dd->nc[dim]);
5398                     snew(root->bound_max, dd->nc[dim]);
5399                 }
5400                 snew(root->buf_ncd, dd->nc[dim]);
5401             }
5402             else
5403             {
5404                 /* This is not a root process, we only need to receive cell_f */
5405                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5406             }
5407         }
5408         if (dd->ci[dim] == dd->master_ci[dim])
5409         {
5410             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5411         }
5412     }
5413 }
5414 #endif
5415
5416 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5417                                    const gmx_hw_info_t gmx_unused *hwinfo,
5418                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5419 {
5420 #ifdef GMX_MPI
5421     int           physicalnode_id_hash;
5422     int           gpu_id;
5423     gmx_domdec_t *dd;
5424     MPI_Comm      mpi_comm_pp_physicalnode;
5425
5426     if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5427     {
5428         /* Only PP nodes (currently) use GPUs.
5429          * If we don't have GPUs, there are no resources to share.
5430          */
5431         return;
5432     }
5433
5434     physicalnode_id_hash = gmx_physicalnode_id_hash();
5435
5436     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5437
5438     dd = cr->dd;
5439
5440     if (debug)
5441     {
5442         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5443         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5444                 dd->rank, physicalnode_id_hash, gpu_id);
5445     }
5446     /* Split the PP communicator over the physical nodes */
5447     /* TODO: See if we should store this (before), as it's also used for
5448      * for the nodecomm summution.
5449      */
5450     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5451                    &mpi_comm_pp_physicalnode);
5452     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5453                    &dd->comm->mpi_comm_gpu_shared);
5454     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5455     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5456
5457     if (debug)
5458     {
5459         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5460     }
5461
5462     /* Note that some ranks could share a GPU, while others don't */
5463
5464     if (dd->comm->nrank_gpu_shared == 1)
5465     {
5466         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5467     }
5468 #endif
5469 }
5470
5471 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5472 {
5473 #ifdef GMX_MPI
5474     int  dim0, dim1, i, j;
5475     ivec loc;
5476
5477     if (debug)
5478     {
5479         fprintf(debug, "Making load communicators\n");
5480     }
5481
5482     snew(dd->comm->load,          std::max(dd->ndim, 1));
5483     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5484
5485     if (dd->ndim == 0)
5486     {
5487         return;
5488     }
5489
5490     clear_ivec(loc);
5491     make_load_communicator(dd, 0, loc);
5492     if (dd->ndim > 1)
5493     {
5494         dim0 = dd->dim[0];
5495         for (i = 0; i < dd->nc[dim0]; i++)
5496         {
5497             loc[dim0] = i;
5498             make_load_communicator(dd, 1, loc);
5499         }
5500     }
5501     if (dd->ndim > 2)
5502     {
5503         dim0 = dd->dim[0];
5504         for (i = 0; i < dd->nc[dim0]; i++)
5505         {
5506             loc[dim0] = i;
5507             dim1      = dd->dim[1];
5508             for (j = 0; j < dd->nc[dim1]; j++)
5509             {
5510                 loc[dim1] = j;
5511                 make_load_communicator(dd, 2, loc);
5512             }
5513         }
5514     }
5515
5516     if (debug)
5517     {
5518         fprintf(debug, "Finished making load communicators\n");
5519     }
5520 #endif
5521 }
5522
5523 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5524 {
5525     int                     d, dim, i, j, m;
5526     ivec                    tmp, s;
5527     int                     nzone, nzonep;
5528     ivec                    dd_zp[DD_MAXIZONE];
5529     gmx_domdec_zones_t     *zones;
5530     gmx_domdec_ns_ranges_t *izone;
5531
5532     for (d = 0; d < dd->ndim; d++)
5533     {
5534         dim = dd->dim[d];
5535         copy_ivec(dd->ci, tmp);
5536         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5537         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5538         copy_ivec(dd->ci, tmp);
5539         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5540         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5541         if (debug)
5542         {
5543             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5544                     dd->rank, dim,
5545                     dd->neighbor[d][0],
5546                     dd->neighbor[d][1]);
5547         }
5548     }
5549
5550     if (fplog)
5551     {
5552         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5553                 dd->ndim,
5554                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5555                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5556     }
5557     switch (dd->ndim)
5558     {
5559         case 3:
5560             nzone  = dd_z3n;
5561             nzonep = dd_zp3n;
5562             for (i = 0; i < nzonep; i++)
5563             {
5564                 copy_ivec(dd_zp3[i], dd_zp[i]);
5565             }
5566             break;
5567         case 2:
5568             nzone  = dd_z2n;
5569             nzonep = dd_zp2n;
5570             for (i = 0; i < nzonep; i++)
5571             {
5572                 copy_ivec(dd_zp2[i], dd_zp[i]);
5573             }
5574             break;
5575         case 1:
5576             nzone  = dd_z1n;
5577             nzonep = dd_zp1n;
5578             for (i = 0; i < nzonep; i++)
5579             {
5580                 copy_ivec(dd_zp1[i], dd_zp[i]);
5581             }
5582             break;
5583         case 0:
5584             nzone  = dd_z0n;
5585             nzonep = dd_zp0n;
5586             for (i = 0; i < nzonep; i++)
5587             {
5588                 copy_ivec(dd_zp0[i], dd_zp[i]);
5589             }
5590             break;
5591         default:
5592             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5593             nzone  = 0;
5594             nzonep = 0;
5595     }
5596
5597     zones = &dd->comm->zones;
5598
5599     for (i = 0; i < nzone; i++)
5600     {
5601         m = 0;
5602         clear_ivec(zones->shift[i]);
5603         for (d = 0; d < dd->ndim; d++)
5604         {
5605             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5606         }
5607     }
5608
5609     zones->n = nzone;
5610     for (i = 0; i < nzone; i++)
5611     {
5612         for (d = 0; d < DIM; d++)
5613         {
5614             s[d] = dd->ci[d] - zones->shift[i][d];
5615             if (s[d] < 0)
5616             {
5617                 s[d] += dd->nc[d];
5618             }
5619             else if (s[d] >= dd->nc[d])
5620             {
5621                 s[d] -= dd->nc[d];
5622             }
5623         }
5624     }
5625     zones->nizone = nzonep;
5626     for (i = 0; i < zones->nizone; i++)
5627     {
5628         if (dd_zp[i][0] != i)
5629         {
5630             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5631         }
5632         izone     = &zones->izone[i];
5633         izone->j0 = dd_zp[i][1];
5634         izone->j1 = dd_zp[i][2];
5635         for (dim = 0; dim < DIM; dim++)
5636         {
5637             if (dd->nc[dim] == 1)
5638             {
5639                 /* All shifts should be allowed */
5640                 izone->shift0[dim] = -1;
5641                 izone->shift1[dim] = 1;
5642             }
5643             else
5644             {
5645                 /*
5646                    izone->shift0[d] = 0;
5647                    izone->shift1[d] = 0;
5648                    for(j=izone->j0; j<izone->j1; j++) {
5649                    if (dd->shift[j][d] > dd->shift[i][d])
5650                    izone->shift0[d] = -1;
5651                    if (dd->shift[j][d] < dd->shift[i][d])
5652                    izone->shift1[d] = 1;
5653                    }
5654                  */
5655
5656                 int shift_diff;
5657
5658                 /* Assume the shift are not more than 1 cell */
5659                 izone->shift0[dim] = 1;
5660                 izone->shift1[dim] = -1;
5661                 for (j = izone->j0; j < izone->j1; j++)
5662                 {
5663                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5664                     if (shift_diff < izone->shift0[dim])
5665                     {
5666                         izone->shift0[dim] = shift_diff;
5667                     }
5668                     if (shift_diff > izone->shift1[dim])
5669                     {
5670                         izone->shift1[dim] = shift_diff;
5671                     }
5672                 }
5673             }
5674         }
5675     }
5676
5677     if (dd->comm->dlbState != edlbsOffForever)
5678     {
5679         snew(dd->comm->root, dd->ndim);
5680     }
5681
5682     if (dd->comm->bRecordLoad)
5683     {
5684         make_load_communicators(dd);
5685     }
5686 }
5687
5688 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5689 {
5690     gmx_domdec_t      *dd;
5691     dd   = cr->dd;
5692
5693 #ifdef GMX_MPI
5694     gmx_domdec_comm_t *comm;
5695     int                rank, *buf;
5696     ivec               periods;
5697     MPI_Comm           comm_cart;
5698
5699     comm = dd->comm;
5700
5701     if (comm->bCartesianPP)
5702     {
5703         /* Set up cartesian communication for the particle-particle part */
5704         if (fplog)
5705         {
5706             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5707                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5708         }
5709
5710         for (int i = 0; i < DIM; i++)
5711         {
5712             periods[i] = TRUE;
5713         }
5714         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5715                         &comm_cart);
5716         /* We overwrite the old communicator with the new cartesian one */
5717         cr->mpi_comm_mygroup = comm_cart;
5718     }
5719
5720     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5721     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5722
5723     if (comm->bCartesianPP_PME)
5724     {
5725         /* Since we want to use the original cartesian setup for sim,
5726          * and not the one after split, we need to make an index.
5727          */
5728         snew(comm->ddindex2ddnodeid, dd->nnodes);
5729         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5730         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5731         /* Get the rank of the DD master,
5732          * above we made sure that the master node is a PP node.
5733          */
5734         if (MASTER(cr))
5735         {
5736             rank = dd->rank;
5737         }
5738         else
5739         {
5740             rank = 0;
5741         }
5742         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5743     }
5744     else if (comm->bCartesianPP)
5745     {
5746         if (cr->npmenodes == 0)
5747         {
5748             /* The PP communicator is also
5749              * the communicator for this simulation
5750              */
5751             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5752         }
5753         cr->nodeid = dd->rank;
5754
5755         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5756
5757         /* We need to make an index to go from the coordinates
5758          * to the nodeid of this simulation.
5759          */
5760         snew(comm->ddindex2simnodeid, dd->nnodes);
5761         snew(buf, dd->nnodes);
5762         if (cr->duty & DUTY_PP)
5763         {
5764             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5765         }
5766         /* Communicate the ddindex to simulation nodeid index */
5767         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5768                       cr->mpi_comm_mysim);
5769         sfree(buf);
5770
5771         /* Determine the master coordinates and rank.
5772          * The DD master should be the same node as the master of this sim.
5773          */
5774         for (int i = 0; i < dd->nnodes; i++)
5775         {
5776             if (comm->ddindex2simnodeid[i] == 0)
5777             {
5778                 ddindex2xyz(dd->nc, i, dd->master_ci);
5779                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5780             }
5781         }
5782         if (debug)
5783         {
5784             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5785         }
5786     }
5787     else
5788     {
5789         /* No Cartesian communicators */
5790         /* We use the rank in dd->comm->all as DD index */
5791         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5792         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5793         dd->masterrank = 0;
5794         clear_ivec(dd->master_ci);
5795     }
5796 #endif
5797
5798     if (fplog)
5799     {
5800         fprintf(fplog,
5801                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5802                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5803     }
5804     if (debug)
5805     {
5806         fprintf(debug,
5807                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5808                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5809     }
5810 }
5811
5812 static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
5813 {
5814 #ifdef GMX_MPI
5815     gmx_domdec_t      *dd;
5816     gmx_domdec_comm_t *comm;
5817
5818     dd   = cr->dd;
5819     comm = dd->comm;
5820
5821     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5822     {
5823         int *buf;
5824         snew(comm->ddindex2simnodeid, dd->nnodes);
5825         snew(buf, dd->nnodes);
5826         if (cr->duty & DUTY_PP)
5827         {
5828             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5829         }
5830         /* Communicate the ddindex to simulation nodeid index */
5831         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5832                       cr->mpi_comm_mysim);
5833         sfree(buf);
5834     }
5835 #endif
5836 }
5837
5838 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5839                                                      int ncg, int natoms)
5840 {
5841     gmx_domdec_master_t *ma;
5842     int                  i;
5843
5844     snew(ma, 1);
5845
5846     snew(ma->ncg, dd->nnodes);
5847     snew(ma->index, dd->nnodes+1);
5848     snew(ma->cg, ncg);
5849     snew(ma->nat, dd->nnodes);
5850     snew(ma->ibuf, dd->nnodes*2);
5851     snew(ma->cell_x, DIM);
5852     for (i = 0; i < DIM; i++)
5853     {
5854         snew(ma->cell_x[i], dd->nc[i]+1);
5855     }
5856
5857     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5858     {
5859         ma->vbuf = NULL;
5860     }
5861     else
5862     {
5863         snew(ma->vbuf, natoms);
5864     }
5865
5866     return ma;
5867 }
5868
5869 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
5870                                int gmx_unused reorder)
5871 {
5872     gmx_domdec_t      *dd;
5873     gmx_domdec_comm_t *comm;
5874     int                i;
5875     gmx_bool           bDiv[DIM];
5876 #ifdef GMX_MPI
5877     MPI_Comm           comm_cart;
5878 #endif
5879
5880     dd   = cr->dd;
5881     comm = dd->comm;
5882
5883     if (comm->bCartesianPP)
5884     {
5885         for (i = 1; i < DIM; i++)
5886         {
5887             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5888         }
5889         if (bDiv[YY] || bDiv[ZZ])
5890         {
5891             comm->bCartesianPP_PME = TRUE;
5892             /* If we have 2D PME decomposition, which is always in x+y,
5893              * we stack the PME only nodes in z.
5894              * Otherwise we choose the direction that provides the thinnest slab
5895              * of PME only nodes as this will have the least effect
5896              * on the PP communication.
5897              * But for the PME communication the opposite might be better.
5898              */
5899             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5900                              !bDiv[YY] ||
5901                              dd->nc[YY] > dd->nc[ZZ]))
5902             {
5903                 comm->cartpmedim = ZZ;
5904             }
5905             else
5906             {
5907                 comm->cartpmedim = YY;
5908             }
5909             comm->ntot[comm->cartpmedim]
5910                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5911         }
5912         else if (fplog)
5913         {
5914             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5915             fprintf(fplog,
5916                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5917         }
5918     }
5919
5920 #ifdef GMX_MPI
5921     if (comm->bCartesianPP_PME)
5922     {
5923         int  rank;
5924         ivec periods;
5925
5926         if (fplog)
5927         {
5928             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5929         }
5930
5931         for (i = 0; i < DIM; i++)
5932         {
5933             periods[i] = TRUE;
5934         }
5935         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5936                         &comm_cart);
5937         MPI_Comm_rank(comm_cart, &rank);
5938         if (MASTER(cr) && rank != 0)
5939         {
5940             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5941         }
5942
5943         /* With this assigment we loose the link to the original communicator
5944          * which will usually be MPI_COMM_WORLD, unless have multisim.
5945          */
5946         cr->mpi_comm_mysim = comm_cart;
5947         cr->sim_nodeid     = rank;
5948
5949         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5950
5951         if (fplog)
5952         {
5953             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5954                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5955         }
5956
5957         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5958         {
5959             cr->duty = DUTY_PP;
5960         }
5961         if (cr->npmenodes == 0 ||
5962             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5963         {
5964             cr->duty = DUTY_PME;
5965         }
5966
5967         /* Split the sim communicator into PP and PME only nodes */
5968         MPI_Comm_split(cr->mpi_comm_mysim,
5969                        cr->duty,
5970                        dd_index(comm->ntot, dd->ci),
5971                        &cr->mpi_comm_mygroup);
5972     }
5973     else
5974     {
5975         switch (dd_node_order)
5976         {
5977             case ddnoPP_PME:
5978                 if (fplog)
5979                 {
5980                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5981                 }
5982                 break;
5983             case ddnoINTERLEAVE:
5984                 /* Interleave the PP-only and PME-only nodes,
5985                  * as on clusters with dual-core machines this will double
5986                  * the communication bandwidth of the PME processes
5987                  * and thus speed up the PP <-> PME and inter PME communication.
5988                  */
5989                 if (fplog)
5990                 {
5991                     fprintf(fplog, "Interleaving PP and PME ranks\n");
5992                 }
5993                 comm->pmenodes = dd_pmenodes(cr);
5994                 break;
5995             case ddnoCARTESIAN:
5996                 break;
5997             default:
5998                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
5999         }
6000
6001         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6002         {
6003             cr->duty = DUTY_PME;
6004         }
6005         else
6006         {
6007             cr->duty = DUTY_PP;
6008         }
6009
6010         /* Split the sim communicator into PP and PME only nodes */
6011         MPI_Comm_split(cr->mpi_comm_mysim,
6012                        cr->duty,
6013                        cr->nodeid,
6014                        &cr->mpi_comm_mygroup);
6015         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6016     }
6017 #endif
6018
6019     if (fplog)
6020     {
6021         fprintf(fplog, "This rank does only %s work.\n\n",
6022                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6023     }
6024 }
6025
6026 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6027 {
6028     gmx_domdec_t      *dd;
6029     gmx_domdec_comm_t *comm;
6030     int                CartReorder;
6031
6032     dd   = cr->dd;
6033     comm = dd->comm;
6034
6035     copy_ivec(dd->nc, comm->ntot);
6036
6037     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6038     comm->bCartesianPP_PME = FALSE;
6039
6040     /* Reorder the nodes by default. This might change the MPI ranks.
6041      * Real reordering is only supported on very few architectures,
6042      * Blue Gene is one of them.
6043      */
6044     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6045
6046     if (cr->npmenodes > 0)
6047     {
6048         /* Split the communicator into a PP and PME part */
6049         split_communicator(fplog, cr, dd_node_order, CartReorder);
6050         if (comm->bCartesianPP_PME)
6051         {
6052             /* We (possibly) reordered the nodes in split_communicator,
6053              * so it is no longer required in make_pp_communicator.
6054              */
6055             CartReorder = FALSE;
6056         }
6057     }
6058     else
6059     {
6060         /* All nodes do PP and PME */
6061 #ifdef GMX_MPI
6062         /* We do not require separate communicators */
6063         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6064 #endif
6065     }
6066
6067     if (cr->duty & DUTY_PP)
6068     {
6069         /* Copy or make a new PP communicator */
6070         make_pp_communicator(fplog, cr, CartReorder);
6071     }
6072     else
6073     {
6074         receive_ddindex2simnodeid(cr);
6075     }
6076
6077     if (!(cr->duty & DUTY_PME))
6078     {
6079         /* Set up the commnuication to our PME node */
6080         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6081         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6082         if (debug)
6083         {
6084             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6085                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6086         }
6087     }
6088     else
6089     {
6090         dd->pme_nodeid = -1;
6091     }
6092
6093     if (DDMASTER(dd))
6094     {
6095         dd->ma = init_gmx_domdec_master_t(dd,
6096                                           comm->cgs_gl.nr,
6097                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6098     }
6099 }
6100
6101 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6102 {
6103     real  *slb_frac, tot;
6104     int    i, n;
6105     double dbl;
6106
6107     slb_frac = NULL;
6108     if (nc > 1 && size_string != NULL)
6109     {
6110         if (fplog)
6111         {
6112             fprintf(fplog, "Using static load balancing for the %s direction\n",
6113                     dir);
6114         }
6115         snew(slb_frac, nc);
6116         tot = 0;
6117         for (i = 0; i < nc; i++)
6118         {
6119             dbl = 0;
6120             sscanf(size_string, "%20lf%n", &dbl, &n);
6121             if (dbl == 0)
6122             {
6123                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6124             }
6125             slb_frac[i]  = dbl;
6126             size_string += n;
6127             tot         += slb_frac[i];
6128         }
6129         /* Normalize */
6130         if (fplog)
6131         {
6132             fprintf(fplog, "Relative cell sizes:");
6133         }
6134         for (i = 0; i < nc; i++)
6135         {
6136             slb_frac[i] /= tot;
6137             if (fplog)
6138             {
6139                 fprintf(fplog, " %5.3f", slb_frac[i]);
6140             }
6141         }
6142         if (fplog)
6143         {
6144             fprintf(fplog, "\n");
6145         }
6146     }
6147
6148     return slb_frac;
6149 }
6150
6151 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6152 {
6153     int                  n, nmol, ftype;
6154     gmx_mtop_ilistloop_t iloop;
6155     t_ilist             *il;
6156
6157     n     = 0;
6158     iloop = gmx_mtop_ilistloop_init(mtop);
6159     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6160     {
6161         for (ftype = 0; ftype < F_NRE; ftype++)
6162         {
6163             if ((interaction_function[ftype].flags & IF_BOND) &&
6164                 NRAL(ftype) >  2)
6165             {
6166                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6167             }
6168         }
6169     }
6170
6171     return n;
6172 }
6173
6174 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6175 {
6176     char *val;
6177     int   nst;
6178
6179     nst = def;
6180     val = getenv(env_var);
6181     if (val)
6182     {
6183         if (sscanf(val, "%20d", &nst) <= 0)
6184         {
6185             nst = 1;
6186         }
6187         if (fplog)
6188         {
6189             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6190                     env_var, val, nst);
6191         }
6192     }
6193
6194     return nst;
6195 }
6196
6197 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6198 {
6199     if (MASTER(cr))
6200     {
6201         fprintf(stderr, "\n%s\n", warn_string);
6202     }
6203     if (fplog)
6204     {
6205         fprintf(fplog, "\n%s\n", warn_string);
6206     }
6207 }
6208
6209 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6210                                   t_inputrec *ir, FILE *fplog)
6211 {
6212     if (ir->ePBC == epbcSCREW &&
6213         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6214     {
6215         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6216     }
6217
6218     if (ir->ns_type == ensSIMPLE)
6219     {
6220         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6221     }
6222
6223     if (ir->nstlist == 0)
6224     {
6225         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6226     }
6227
6228     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6229     {
6230         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6231     }
6232 }
6233
6234 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6235 {
6236     int  di, d;
6237     real r;
6238
6239     r = ddbox->box_size[XX];
6240     for (di = 0; di < dd->ndim; di++)
6241     {
6242         d = dd->dim[di];
6243         /* Check using the initial average cell size */
6244         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6245     }
6246
6247     return r;
6248 }
6249
6250 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6251                              const char *dlb_opt, gmx_bool bRecordLoad,
6252                              unsigned long Flags, t_inputrec *ir)
6253 {
6254     int           dlbState = -1;
6255     char          buf[STRLEN];
6256
6257     switch (dlb_opt[0])
6258     {
6259         case 'a': dlbState = edlbsOffCanTurnOn; break;
6260         case 'n': dlbState = edlbsOffForever;   break;
6261         case 'y': dlbState = edlbsOn;           break;
6262         default: gmx_incons("Unknown dlb_opt");
6263     }
6264
6265     if (Flags & MD_RERUN)
6266     {
6267         return edlbsOffForever;
6268     }
6269
6270     if (!EI_DYNAMICS(ir->eI))
6271     {
6272         if (dlbState == edlbsOn)
6273         {
6274             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6275             dd_warning(cr, fplog, buf);
6276         }
6277
6278         return edlbsOffForever;
6279     }
6280
6281     if (!bRecordLoad)
6282     {
6283         dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6284         return edlbsOffForever;
6285     }
6286
6287     if (Flags & MD_REPRODUCIBLE)
6288     {
6289         switch (dlbState)
6290         {
6291             case edlbsOffForever:
6292                 break;
6293             case edlbsOffCanTurnOn:
6294                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6295                 dlbState = edlbsOffForever;
6296                 break;
6297             case edlbsOn:
6298                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6299                 break;
6300             default:
6301                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6302                 break;
6303         }
6304     }
6305
6306     return dlbState;
6307 }
6308
6309 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6310 {
6311     int dim;
6312
6313     dd->ndim = 0;
6314     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6315     {
6316         /* Decomposition order z,y,x */
6317         if (fplog)
6318         {
6319             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6320         }
6321         for (dim = DIM-1; dim >= 0; dim--)
6322         {
6323             if (dd->nc[dim] > 1)
6324             {
6325                 dd->dim[dd->ndim++] = dim;
6326             }
6327         }
6328     }
6329     else
6330     {
6331         /* Decomposition order x,y,z */
6332         for (dim = 0; dim < DIM; dim++)
6333         {
6334             if (dd->nc[dim] > 1)
6335             {
6336                 dd->dim[dd->ndim++] = dim;
6337             }
6338         }
6339     }
6340 }
6341
6342 static gmx_domdec_comm_t *init_dd_comm()
6343 {
6344     gmx_domdec_comm_t *comm;
6345     int                i;
6346
6347     snew(comm, 1);
6348     snew(comm->cggl_flag, DIM*2);
6349     snew(comm->cgcm_state, DIM*2);
6350     for (i = 0; i < DIM*2; i++)
6351     {
6352         comm->cggl_flag_nalloc[i]  = 0;
6353         comm->cgcm_state_nalloc[i] = 0;
6354     }
6355
6356     comm->nalloc_int = 0;
6357     comm->buf_int    = NULL;
6358
6359     vec_rvec_init(&comm->vbuf);
6360
6361     comm->n_load_have    = 0;
6362     comm->n_load_collect = 0;
6363
6364     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6365     {
6366         comm->sum_nat[i] = 0;
6367     }
6368     comm->ndecomp   = 0;
6369     comm->nload     = 0;
6370     comm->load_step = 0;
6371     comm->load_sum  = 0;
6372     comm->load_max  = 0;
6373     clear_ivec(comm->load_lim);
6374     comm->load_mdf  = 0;
6375     comm->load_pme  = 0;
6376
6377     return comm;
6378 }
6379
6380 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6381                                         unsigned long Flags,
6382                                         ivec nc,
6383                                         real comm_distance_min, real rconstr,
6384                                         const char *dlb_opt, real dlb_scale,
6385                                         const char *sizex, const char *sizey, const char *sizez,
6386                                         gmx_mtop_t *mtop, t_inputrec *ir,
6387                                         matrix box, rvec *x,
6388                                         gmx_ddbox_t *ddbox,
6389                                         int *npme_x, int *npme_y)
6390 {
6391     gmx_domdec_t      *dd;
6392     gmx_domdec_comm_t *comm;
6393     int                recload;
6394     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6395     gmx_bool           bC;
6396     char               buf[STRLEN];
6397     const real         tenPercentMargin = 1.1;
6398
6399     if (fplog)
6400     {
6401         fprintf(fplog,
6402                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6403     }
6404
6405     snew(dd, 1);
6406
6407     dd->comm = init_dd_comm();
6408     comm     = dd->comm;
6409     snew(comm->cggl_flag, DIM*2);
6410     snew(comm->cgcm_state, DIM*2);
6411
6412     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6413     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6414
6415     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6416     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6417     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6418     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6419     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6420     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6421     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6422     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6423
6424     dd->pme_recv_f_alloc = 0;
6425     dd->pme_recv_f_buf   = NULL;
6426
6427     if (dd->bSendRecv2 && fplog)
6428     {
6429         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6430     }
6431     if (comm->eFlop)
6432     {
6433         if (fplog)
6434         {
6435             fprintf(fplog, "Will load balance based on FLOP count\n");
6436         }
6437         if (comm->eFlop > 1)
6438         {
6439             srand(1+cr->nodeid);
6440         }
6441         comm->bRecordLoad = TRUE;
6442     }
6443     else
6444     {
6445         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6446
6447     }
6448
6449     /* Initialize to GPU share count to 0, might change later */
6450     comm->nrank_gpu_shared = 0;
6451
6452     comm->dlbState                 = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6453     comm->bCheckWhetherToTurnDlbOn = TRUE;
6454
6455     if (fplog)
6456     {
6457         fprintf(fplog, "Dynamic load balancing: %s\n",
6458                 edlbs_names[comm->dlbState]);
6459     }
6460     comm->bPMELoadBalDLBLimits = FALSE;
6461
6462     if (comm->nstSortCG)
6463     {
6464         if (fplog)
6465         {
6466             if (comm->nstSortCG == 1)
6467             {
6468                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6469             }
6470             else
6471             {
6472                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6473                         comm->nstSortCG);
6474             }
6475         }
6476         snew(comm->sort, 1);
6477     }
6478     else
6479     {
6480         if (fplog)
6481         {
6482             fprintf(fplog, "Will not sort the charge groups\n");
6483         }
6484     }
6485
6486     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6487
6488     comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6489                              mtop->bIntermolecularInteractions);
6490     if (comm->bInterCGBondeds)
6491     {
6492         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6493     }
6494     else
6495     {
6496         comm->bInterCGMultiBody = FALSE;
6497     }
6498
6499     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6500     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6501
6502     if (ir->rlist == 0)
6503     {
6504         /* Set the cut-off to some very large value,
6505          * so we don't need if statements everywhere in the code.
6506          * We use sqrt, since the cut-off is squared in some places.
6507          */
6508         comm->cutoff   = GMX_CUTOFF_INF;
6509     }
6510     else
6511     {
6512         comm->cutoff   = ir->rlist;
6513     }
6514     comm->cutoff_mbody = 0;
6515
6516     comm->cellsize_limit = 0;
6517     comm->bBondComm      = FALSE;
6518
6519     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6520      * within nstlist steps. Since boundaries are allowed to displace by half
6521      * a cell size, DD cells should be at least the size of the list buffer.
6522      */
6523     comm->cellsize_limit = std::max(comm->cellsize_limit,
6524                                     ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6525
6526     if (comm->bInterCGBondeds)
6527     {
6528         if (comm_distance_min > 0)
6529         {
6530             comm->cutoff_mbody = comm_distance_min;
6531             if (Flags & MD_DDBONDCOMM)
6532             {
6533                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6534             }
6535             else
6536             {
6537                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6538             }
6539             r_bonded_limit = comm->cutoff_mbody;
6540         }
6541         else if (ir->bPeriodicMols)
6542         {
6543             /* Can not easily determine the required cut-off */
6544             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6545             comm->cutoff_mbody = comm->cutoff/2;
6546             r_bonded_limit     = comm->cutoff_mbody;
6547         }
6548         else
6549         {
6550             if (MASTER(cr))
6551             {
6552                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6553                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6554             }
6555             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6556             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6557
6558             /* We use an initial margin of 10% for the minimum cell size,
6559              * except when we are just below the non-bonded cut-off.
6560              */
6561             if (Flags & MD_DDBONDCOMM)
6562             {
6563                 if (std::max(r_2b, r_mb) > comm->cutoff)
6564                 {
6565                     r_bonded        = std::max(r_2b, r_mb);
6566                     r_bonded_limit  = tenPercentMargin*r_bonded;
6567                     comm->bBondComm = TRUE;
6568                 }
6569                 else
6570                 {
6571                     r_bonded       = r_mb;
6572                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6573                 }
6574                 /* We determine cutoff_mbody later */
6575             }
6576             else
6577             {
6578                 /* No special bonded communication,
6579                  * simply increase the DD cut-off.
6580                  */
6581                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
6582                 comm->cutoff_mbody = r_bonded_limit;
6583                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
6584             }
6585         }
6586         if (fplog)
6587         {
6588             fprintf(fplog,
6589                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6590                     r_bonded_limit);
6591         }
6592         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6593     }
6594
6595     if (dd->bInterCGcons && rconstr <= 0)
6596     {
6597         /* There is a cell size limit due to the constraints (P-LINCS) */
6598         rconstr = constr_r_max(fplog, mtop, ir);
6599         if (fplog)
6600         {
6601             fprintf(fplog,
6602                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6603                     rconstr);
6604             if (rconstr > comm->cellsize_limit)
6605             {
6606                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6607             }
6608         }
6609     }
6610     else if (rconstr > 0 && fplog)
6611     {
6612         /* Here we do not check for dd->bInterCGcons,
6613          * because one can also set a cell size limit for virtual sites only
6614          * and at this point we don't know yet if there are intercg v-sites.
6615          */
6616         fprintf(fplog,
6617                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6618                 rconstr);
6619     }
6620     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6621
6622     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6623
6624     if (nc[XX] > 0)
6625     {
6626         copy_ivec(nc, dd->nc);
6627         set_dd_dim(fplog, dd);
6628         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6629
6630         if (cr->npmenodes == -1)
6631         {
6632             cr->npmenodes = 0;
6633         }
6634         acs = average_cellsize_min(dd, ddbox);
6635         if (acs < comm->cellsize_limit)
6636         {
6637             if (fplog)
6638             {
6639                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6640             }
6641             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6642                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6643                                  acs, comm->cellsize_limit);
6644         }
6645     }
6646     else
6647     {
6648         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6649
6650         /* We need to choose the optimal DD grid and possibly PME nodes */
6651         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6652                                comm->dlbState != edlbsOffForever, dlb_scale,
6653                                comm->cellsize_limit, comm->cutoff,
6654                                comm->bInterCGBondeds);
6655
6656         if (dd->nc[XX] == 0)
6657         {
6658             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6659             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6660                     !bC ? "-rdd" : "-rcon",
6661                     comm->dlbState != edlbsOffForever ? " or -dds" : "",
6662                     bC ? " or your LINCS settings" : "");
6663
6664             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6665                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6666                                  "%s\n"
6667                                  "Look in the log file for details on the domain decomposition",
6668                                  cr->nnodes-cr->npmenodes, limit, buf);
6669         }
6670         set_dd_dim(fplog, dd);
6671     }
6672
6673     if (fplog)
6674     {
6675         fprintf(fplog,
6676                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6677                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6678     }
6679
6680     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6681     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6682     {
6683         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6684                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6685                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6686     }
6687     if (cr->npmenodes > dd->nnodes)
6688     {
6689         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6690                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6691     }
6692     if (cr->npmenodes > 0)
6693     {
6694         comm->npmenodes = cr->npmenodes;
6695     }
6696     else
6697     {
6698         comm->npmenodes = dd->nnodes;
6699     }
6700
6701     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6702     {
6703         /* The following choices should match those
6704          * in comm_cost_est in domdec_setup.c.
6705          * Note that here the checks have to take into account
6706          * that the decomposition might occur in a different order than xyz
6707          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6708          * in which case they will not match those in comm_cost_est,
6709          * but since that is mainly for testing purposes that's fine.
6710          */
6711         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6712             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6713             getenv("GMX_PMEONEDD") == NULL)
6714         {
6715             comm->npmedecompdim = 2;
6716             comm->npmenodes_x   = dd->nc[XX];
6717             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6718         }
6719         else
6720         {
6721             /* In case nc is 1 in both x and y we could still choose to
6722              * decompose pme in y instead of x, but we use x for simplicity.
6723              */
6724             comm->npmedecompdim = 1;
6725             if (dd->dim[0] == YY)
6726             {
6727                 comm->npmenodes_x = 1;
6728                 comm->npmenodes_y = comm->npmenodes;
6729             }
6730             else
6731             {
6732                 comm->npmenodes_x = comm->npmenodes;
6733                 comm->npmenodes_y = 1;
6734             }
6735         }
6736         if (fplog)
6737         {
6738             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6739                     comm->npmenodes_x, comm->npmenodes_y, 1);
6740         }
6741     }
6742     else
6743     {
6744         comm->npmedecompdim = 0;
6745         comm->npmenodes_x   = 0;
6746         comm->npmenodes_y   = 0;
6747     }
6748
6749     /* Technically we don't need both of these,
6750      * but it simplifies code not having to recalculate it.
6751      */
6752     *npme_x = comm->npmenodes_x;
6753     *npme_y = comm->npmenodes_y;
6754
6755     snew(comm->slb_frac, DIM);
6756     if (comm->dlbState == edlbsOffForever)
6757     {
6758         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6759         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6760         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6761     }
6762
6763     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6764     {
6765         if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6766         {
6767             /* Set the bonded communication distance to halfway
6768              * the minimum and the maximum,
6769              * since the extra communication cost is nearly zero.
6770              */
6771             acs                = average_cellsize_min(dd, ddbox);
6772             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6773             if (comm->dlbState != edlbsOffForever)
6774             {
6775                 /* Check if this does not limit the scaling */
6776                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6777             }
6778             if (!comm->bBondComm)
6779             {
6780                 /* Without bBondComm do not go beyond the n.b. cut-off */
6781                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6782                 if (comm->cellsize_limit >= comm->cutoff)
6783                 {
6784                     /* We don't loose a lot of efficieny
6785                      * when increasing it to the n.b. cut-off.
6786                      * It can even be slightly faster, because we need
6787                      * less checks for the communication setup.
6788                      */
6789                     comm->cutoff_mbody = comm->cutoff;
6790                 }
6791             }
6792             /* Check if we did not end up below our original limit */
6793             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6794
6795             if (comm->cutoff_mbody > comm->cellsize_limit)
6796             {
6797                 comm->cellsize_limit = comm->cutoff_mbody;
6798             }
6799         }
6800         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6801     }
6802
6803     if (debug)
6804     {
6805         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6806                 "cellsize limit %f\n",
6807                 comm->bBondComm, comm->cellsize_limit);
6808     }
6809
6810     if (MASTER(cr))
6811     {
6812         check_dd_restrictions(cr, dd, ir, fplog);
6813     }
6814
6815     comm->partition_step = INT_MIN;
6816     dd->ddp_count        = 0;
6817
6818     clear_dd_cycle_counts(dd);
6819
6820     return dd;
6821 }
6822
6823 static void set_dlb_limits(gmx_domdec_t *dd)
6824
6825 {
6826     int d;
6827
6828     for (d = 0; d < dd->ndim; d++)
6829     {
6830         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6831         dd->comm->cellsize_min[dd->dim[d]] =
6832             dd->comm->cellsize_min_dlb[dd->dim[d]];
6833     }
6834 }
6835
6836
6837 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6838 {
6839     gmx_domdec_t      *dd;
6840     gmx_domdec_comm_t *comm;
6841     real               cellsize_min;
6842     int                d, nc, i;
6843     char               buf[STRLEN];
6844
6845     dd   = cr->dd;
6846     comm = dd->comm;
6847
6848     if (fplog)
6849     {
6850         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6851     }
6852
6853     cellsize_min = comm->cellsize_min[dd->dim[0]];
6854     for (d = 1; d < dd->ndim; d++)
6855     {
6856         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6857     }
6858
6859     if (cellsize_min < comm->cellsize_limit*1.05)
6860     {
6861         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6862
6863         /* Change DLB from "auto" to "no". */
6864         comm->dlbState = edlbsOffForever;
6865
6866         return;
6867     }
6868
6869     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6870     comm->dlbState = edlbsOn;
6871
6872     set_dlb_limits(dd);
6873
6874     /* We can set the required cell size info here,
6875      * so we do not need to communicate this.
6876      * The grid is completely uniform.
6877      */
6878     for (d = 0; d < dd->ndim; d++)
6879     {
6880         if (comm->root[d])
6881         {
6882             comm->load[d].sum_m = comm->load[d].sum;
6883
6884             nc = dd->nc[dd->dim[d]];
6885             for (i = 0; i < nc; i++)
6886             {
6887                 comm->root[d]->cell_f[i]    = i/(real)nc;
6888                 if (d > 0)
6889                 {
6890                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6891                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6892                 }
6893             }
6894             comm->root[d]->cell_f[nc] = 1.0;
6895         }
6896     }
6897 }
6898
6899 static char *init_bLocalCG(gmx_mtop_t *mtop)
6900 {
6901     int   ncg, cg;
6902     char *bLocalCG;
6903
6904     ncg = ncg_mtop(mtop);
6905     snew(bLocalCG, ncg);
6906     for (cg = 0; cg < ncg; cg++)
6907     {
6908         bLocalCG[cg] = FALSE;
6909     }
6910
6911     return bLocalCG;
6912 }
6913
6914 void dd_init_bondeds(FILE *fplog,
6915                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
6916                      gmx_vsite_t *vsite,
6917                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6918 {
6919     gmx_domdec_comm_t *comm;
6920
6921     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6922
6923     comm = dd->comm;
6924
6925     if (comm->bBondComm)
6926     {
6927         /* Communicate atoms beyond the cut-off for bonded interactions */
6928         comm = dd->comm;
6929
6930         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6931
6932         comm->bLocalCG = init_bLocalCG(mtop);
6933     }
6934     else
6935     {
6936         /* Only communicate atoms based on cut-off */
6937         comm->cglink   = NULL;
6938         comm->bLocalCG = NULL;
6939     }
6940 }
6941
6942 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6943                               t_inputrec *ir,
6944                               gmx_bool bDynLoadBal, real dlb_scale,
6945                               gmx_ddbox_t *ddbox)
6946 {
6947     gmx_domdec_comm_t *comm;
6948     int                d;
6949     ivec               np;
6950     real               limit, shrink;
6951     char               buf[64];
6952
6953     if (fplog == NULL)
6954     {
6955         return;
6956     }
6957
6958     comm = dd->comm;
6959
6960     if (bDynLoadBal)
6961     {
6962         fprintf(fplog, "The maximum number of communication pulses is:");
6963         for (d = 0; d < dd->ndim; d++)
6964         {
6965             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6966         }
6967         fprintf(fplog, "\n");
6968         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6969         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6970         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6971         for (d = 0; d < DIM; d++)
6972         {
6973             if (dd->nc[d] > 1)
6974             {
6975                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6976                 {
6977                     shrink = 0;
6978                 }
6979                 else
6980                 {
6981                     shrink =
6982                         comm->cellsize_min_dlb[d]/
6983                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6984                 }
6985                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6986             }
6987         }
6988         fprintf(fplog, "\n");
6989     }
6990     else
6991     {
6992         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6993         fprintf(fplog, "The initial number of communication pulses is:");
6994         for (d = 0; d < dd->ndim; d++)
6995         {
6996             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6997         }
6998         fprintf(fplog, "\n");
6999         fprintf(fplog, "The initial domain decomposition cell size is:");
7000         for (d = 0; d < DIM; d++)
7001         {
7002             if (dd->nc[d] > 1)
7003             {
7004                 fprintf(fplog, " %c %.2f nm",
7005                         dim2char(d), dd->comm->cellsize_min[d]);
7006             }
7007         }
7008         fprintf(fplog, "\n\n");
7009     }
7010
7011     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7012     {
7013         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7014         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7015                 "non-bonded interactions", "", comm->cutoff);
7016
7017         if (bDynLoadBal)
7018         {
7019             limit = dd->comm->cellsize_limit;
7020         }
7021         else
7022         {
7023             if (dynamic_dd_box(ddbox, ir))
7024             {
7025                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7026             }
7027             limit = dd->comm->cellsize_min[XX];
7028             for (d = 1; d < DIM; d++)
7029             {
7030                 limit = std::min(limit, dd->comm->cellsize_min[d]);
7031             }
7032         }
7033
7034         if (comm->bInterCGBondeds)
7035         {
7036             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7037                     "two-body bonded interactions", "(-rdd)",
7038                     std::max(comm->cutoff, comm->cutoff_mbody));
7039             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7040                     "multi-body bonded interactions", "(-rdd)",
7041                     (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7042         }
7043         if (dd->vsite_comm)
7044         {
7045             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7046                     "virtual site constructions", "(-rcon)", limit);
7047         }
7048         if (dd->constraint_comm)
7049         {
7050             sprintf(buf, "atoms separated by up to %d constraints",
7051                     1+ir->nProjOrder);
7052             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7053                     buf, "(-rcon)", limit);
7054         }
7055         fprintf(fplog, "\n");
7056     }
7057
7058     fflush(fplog);
7059 }
7060
7061 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7062                                 real               dlb_scale,
7063                                 const t_inputrec  *ir,
7064                                 const gmx_ddbox_t *ddbox)
7065 {
7066     gmx_domdec_comm_t *comm;
7067     int                d, dim, npulse, npulse_d_max, npulse_d;
7068     gmx_bool           bNoCutOff;
7069
7070     comm = dd->comm;
7071
7072     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7073
7074     /* Determine the maximum number of comm. pulses in one dimension */
7075
7076     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7077
7078     /* Determine the maximum required number of grid pulses */
7079     if (comm->cellsize_limit >= comm->cutoff)
7080     {
7081         /* Only a single pulse is required */
7082         npulse = 1;
7083     }
7084     else if (!bNoCutOff && comm->cellsize_limit > 0)
7085     {
7086         /* We round down slightly here to avoid overhead due to the latency
7087          * of extra communication calls when the cut-off
7088          * would be only slightly longer than the cell size.
7089          * Later cellsize_limit is redetermined,
7090          * so we can not miss interactions due to this rounding.
7091          */
7092         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7093     }
7094     else
7095     {
7096         /* There is no cell size limit */
7097         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7098     }
7099
7100     if (!bNoCutOff && npulse > 1)
7101     {
7102         /* See if we can do with less pulses, based on dlb_scale */
7103         npulse_d_max = 0;
7104         for (d = 0; d < dd->ndim; d++)
7105         {
7106             dim      = dd->dim[d];
7107             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7108                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7109             npulse_d_max = std::max(npulse_d_max, npulse_d);
7110         }
7111         npulse = std::min(npulse, npulse_d_max);
7112     }
7113
7114     /* This env var can override npulse */
7115     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7116     if (d > 0)
7117     {
7118         npulse = d;
7119     }
7120
7121     comm->maxpulse       = 1;
7122     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7123     for (d = 0; d < dd->ndim; d++)
7124     {
7125         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
7126         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7127         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7128         comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7129         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7130         {
7131             comm->bVacDLBNoLimit = FALSE;
7132         }
7133     }
7134
7135     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7136     if (!comm->bVacDLBNoLimit)
7137     {
7138         comm->cellsize_limit = std::max(comm->cellsize_limit,
7139                                         comm->cutoff/comm->maxpulse);
7140     }
7141     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7142     /* Set the minimum cell size for each DD dimension */
7143     for (d = 0; d < dd->ndim; d++)
7144     {
7145         if (comm->bVacDLBNoLimit ||
7146             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7147         {
7148             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7149         }
7150         else
7151         {
7152             comm->cellsize_min_dlb[dd->dim[d]] =
7153                 comm->cutoff/comm->cd[d].np_dlb;
7154         }
7155     }
7156     if (comm->cutoff_mbody <= 0)
7157     {
7158         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7159     }
7160     if (dlbIsOn(comm))
7161     {
7162         set_dlb_limits(dd);
7163     }
7164 }
7165
7166 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7167 {
7168     /* If each molecule is a single charge group
7169      * or we use domain decomposition for each periodic dimension,
7170      * we do not need to take pbc into account for the bonded interactions.
7171      */
7172     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7173             !(dd->nc[XX] > 1 &&
7174               dd->nc[YY] > 1 &&
7175               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7176 }
7177
7178 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7179                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7180 {
7181     gmx_domdec_comm_t *comm;
7182     int                natoms_tot;
7183     real               vol_frac;
7184
7185     comm = dd->comm;
7186
7187     /* Initialize the thread data.
7188      * This can not be done in init_domain_decomposition,
7189      * as the numbers of threads is determined later.
7190      */
7191     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7192     if (comm->nth > 1)
7193     {
7194         snew(comm->dth, comm->nth);
7195     }
7196
7197     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7198     {
7199         init_ddpme(dd, &comm->ddpme[0], 0);
7200         if (comm->npmedecompdim >= 2)
7201         {
7202             init_ddpme(dd, &comm->ddpme[1], 1);
7203         }
7204     }
7205     else
7206     {
7207         comm->npmenodes = 0;
7208         if (dd->pme_nodeid >= 0)
7209         {
7210             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7211                                  "Can not have separate PME ranks without PME electrostatics");
7212         }
7213     }
7214
7215     if (debug)
7216     {
7217         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7218     }
7219     if (comm->dlbState != edlbsOffForever)
7220     {
7221         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7222     }
7223
7224     print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox);
7225     if (comm->dlbState == edlbsOffCanTurnOn)
7226     {
7227         if (fplog)
7228         {
7229             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7230         }
7231         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7232     }
7233
7234     if (ir->ePBC == epbcNONE)
7235     {
7236         vol_frac = 1 - 1/(double)dd->nnodes;
7237     }
7238     else
7239     {
7240         vol_frac =
7241             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7242     }
7243     if (debug)
7244     {
7245         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7246     }
7247     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7248
7249     dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7250 }
7251
7252 static gmx_bool test_dd_cutoff(t_commrec *cr,
7253                                t_state *state, const t_inputrec *ir,
7254                                real cutoff_req)
7255 {
7256     gmx_domdec_t *dd;
7257     gmx_ddbox_t   ddbox;
7258     int           d, dim, np;
7259     real          inv_cell_size;
7260     int           LocallyLimited;
7261
7262     dd = cr->dd;
7263
7264     set_ddbox(dd, FALSE, cr, ir, state->box,
7265               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7266
7267     LocallyLimited = 0;
7268
7269     for (d = 0; d < dd->ndim; d++)
7270     {
7271         dim = dd->dim[d];
7272
7273         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7274         if (dynamic_dd_box(&ddbox, ir))
7275         {
7276             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7277         }
7278
7279         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7280
7281         if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7282             dd->comm->cd[d].np_dlb > 0)
7283         {
7284             if (np > dd->comm->cd[d].np_dlb)
7285             {
7286                 return FALSE;
7287             }
7288
7289             /* If a current local cell size is smaller than the requested
7290              * cut-off, we could still fix it, but this gets very complicated.
7291              * Without fixing here, we might actually need more checks.
7292              */
7293             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7294             {
7295                 LocallyLimited = 1;
7296             }
7297         }
7298     }
7299
7300     if (dd->comm->dlbState != edlbsOffForever)
7301     {
7302         /* If DLB is not active yet, we don't need to check the grid jumps.
7303          * Actually we shouldn't, because then the grid jump data is not set.
7304          */
7305         if (dlbIsOn(dd->comm) &&
7306             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7307         {
7308             LocallyLimited = 1;
7309         }
7310
7311         gmx_sumi(1, &LocallyLimited, cr);
7312
7313         if (LocallyLimited > 0)
7314         {
7315             return FALSE;
7316         }
7317     }
7318
7319     return TRUE;
7320 }
7321
7322 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7323                           real cutoff_req)
7324 {
7325     gmx_bool bCutoffAllowed;
7326
7327     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7328
7329     if (bCutoffAllowed)
7330     {
7331         cr->dd->comm->cutoff = cutoff_req;
7332     }
7333
7334     return bCutoffAllowed;
7335 }
7336
7337 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7338 {
7339     gmx_domdec_comm_t *comm;
7340
7341     comm = cr->dd->comm;
7342
7343     /* Turn on the DLB limiting (might have been on already) */
7344     comm->bPMELoadBalDLBLimits = TRUE;
7345
7346     /* Change the cut-off limit */
7347     comm->PMELoadBal_max_cutoff = cutoff;
7348
7349     if (debug)
7350     {
7351         fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7352                 comm->PMELoadBal_max_cutoff);
7353     }
7354 }
7355
7356 /* Sets whether we should later check the load imbalance data, so that
7357  * we can trigger dynamic load balancing if enough imbalance has
7358  * arisen.
7359  *
7360  * Used after PME load balancing unlocks DLB, so that the check
7361  * whether DLB will be useful can happen immediately.
7362  */
7363 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7364 {
7365     if (dd->comm->dlbState == edlbsOffCanTurnOn)
7366     {
7367         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7368     }
7369 }
7370
7371 /* Returns if we should check whether there has been enough load
7372  * imbalance to trigger dynamic load balancing.
7373  */
7374 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7375 {
7376     const int nddp_chk_dlb = 100;
7377
7378     if (dd->comm->dlbState != edlbsOffCanTurnOn)
7379     {
7380         return FALSE;
7381     }
7382
7383     /* We should check whether we should use DLB directly after
7384      * unlocking DLB. */
7385     if (dd->comm->bCheckWhetherToTurnDlbOn)
7386     {
7387         /* This flag was set when the PME load-balancing routines
7388            unlocked DLB, and should now be cleared. */
7389         dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7390         return TRUE;
7391     }
7392     /* We should also check whether we should use DLB every 100
7393      * partitionings (we do not do this every partioning, so that we
7394      * avoid excessive communication). */
7395     if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
7396     {
7397         return TRUE;
7398     }
7399
7400     return FALSE;
7401 }
7402
7403 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7404 {
7405     return (dd->comm->dlbState == edlbsOn);
7406 }
7407
7408 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7409 {
7410     return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7411 }
7412
7413 void dd_dlb_lock(gmx_domdec_t *dd)
7414 {
7415     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7416     if (dd->comm->dlbState == edlbsOffCanTurnOn)
7417     {
7418         dd->comm->dlbState = edlbsOffTemporarilyLocked;
7419     }
7420 }
7421
7422 void dd_dlb_unlock(gmx_domdec_t *dd)
7423 {
7424     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7425     if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7426     {
7427         dd->comm->dlbState = edlbsOffCanTurnOn;
7428         dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
7429     }
7430 }
7431
7432 static void merge_cg_buffers(int ncell,
7433                              gmx_domdec_comm_dim_t *cd, int pulse,
7434                              int  *ncg_cell,
7435                              int  *index_gl, int  *recv_i,
7436                              rvec *cg_cm,    rvec *recv_vr,
7437                              int *cgindex,
7438                              cginfo_mb_t *cginfo_mb, int *cginfo)
7439 {
7440     gmx_domdec_ind_t *ind, *ind_p;
7441     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7442     int               shift, shift_at;
7443
7444     ind = &cd->ind[pulse];
7445
7446     /* First correct the already stored data */
7447     shift = ind->nrecv[ncell];
7448     for (cell = ncell-1; cell >= 0; cell--)
7449     {
7450         shift -= ind->nrecv[cell];
7451         if (shift > 0)
7452         {
7453             /* Move the cg's present from previous grid pulses */
7454             cg0                = ncg_cell[ncell+cell];
7455             cg1                = ncg_cell[ncell+cell+1];
7456             cgindex[cg1+shift] = cgindex[cg1];
7457             for (cg = cg1-1; cg >= cg0; cg--)
7458             {
7459                 index_gl[cg+shift] = index_gl[cg];
7460                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7461                 cgindex[cg+shift] = cgindex[cg];
7462                 cginfo[cg+shift]  = cginfo[cg];
7463             }
7464             /* Correct the already stored send indices for the shift */
7465             for (p = 1; p <= pulse; p++)
7466             {
7467                 ind_p = &cd->ind[p];
7468                 cg0   = 0;
7469                 for (c = 0; c < cell; c++)
7470                 {
7471                     cg0 += ind_p->nsend[c];
7472                 }
7473                 cg1 = cg0 + ind_p->nsend[cell];
7474                 for (cg = cg0; cg < cg1; cg++)
7475                 {
7476                     ind_p->index[cg] += shift;
7477                 }
7478             }
7479         }
7480     }
7481
7482     /* Merge in the communicated buffers */
7483     shift    = 0;
7484     shift_at = 0;
7485     cg0      = 0;
7486     for (cell = 0; cell < ncell; cell++)
7487     {
7488         cg1 = ncg_cell[ncell+cell+1] + shift;
7489         if (shift_at > 0)
7490         {
7491             /* Correct the old cg indices */
7492             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7493             {
7494                 cgindex[cg+1] += shift_at;
7495             }
7496         }
7497         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7498         {
7499             /* Copy this charge group from the buffer */
7500             index_gl[cg1] = recv_i[cg0];
7501             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7502             /* Add it to the cgindex */
7503             cg_gl          = index_gl[cg1];
7504             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7505             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7506             cgindex[cg1+1] = cgindex[cg1] + nat;
7507             cg0++;
7508             cg1++;
7509             shift_at += nat;
7510         }
7511         shift                 += ind->nrecv[cell];
7512         ncg_cell[ncell+cell+1] = cg1;
7513     }
7514 }
7515
7516 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7517                                int nzone, int cg0, const int *cgindex)
7518 {
7519     int cg, zone, p;
7520
7521     /* Store the atom block boundaries for easy copying of communication buffers
7522      */
7523     cg = cg0;
7524     for (zone = 0; zone < nzone; zone++)
7525     {
7526         for (p = 0; p < cd->np; p++)
7527         {
7528             cd->ind[p].cell2at0[zone] = cgindex[cg];
7529             cg += cd->ind[p].nrecv[zone];
7530             cd->ind[p].cell2at1[zone] = cgindex[cg];
7531         }
7532     }
7533 }
7534
7535 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7536 {
7537     int      i;
7538     gmx_bool bMiss;
7539
7540     bMiss = FALSE;
7541     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7542     {
7543         if (!bLocalCG[link->a[i]])
7544         {
7545             bMiss = TRUE;
7546         }
7547     }
7548
7549     return bMiss;
7550 }
7551
7552 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7553 typedef struct {
7554     real c[DIM][4]; /* the corners for the non-bonded communication */
7555     real cr0;       /* corner for rounding */
7556     real cr1[4];    /* corners for rounding */
7557     real bc[DIM];   /* corners for bounded communication */
7558     real bcr1;      /* corner for rounding for bonded communication */
7559 } dd_corners_t;
7560
7561 /* Determine the corners of the domain(s) we are communicating with */
7562 static void
7563 set_dd_corners(const gmx_domdec_t *dd,
7564                int dim0, int dim1, int dim2,
7565                gmx_bool bDistMB,
7566                dd_corners_t *c)
7567 {
7568     const gmx_domdec_comm_t  *comm;
7569     const gmx_domdec_zones_t *zones;
7570     int i, j;
7571
7572     comm = dd->comm;
7573
7574     zones = &comm->zones;
7575
7576     /* Keep the compiler happy */
7577     c->cr0  = 0;
7578     c->bcr1 = 0;
7579
7580     /* The first dimension is equal for all cells */
7581     c->c[0][0] = comm->cell_x0[dim0];
7582     if (bDistMB)
7583     {
7584         c->bc[0] = c->c[0][0];
7585     }
7586     if (dd->ndim >= 2)
7587     {
7588         dim1 = dd->dim[1];
7589         /* This cell row is only seen from the first row */
7590         c->c[1][0] = comm->cell_x0[dim1];
7591         /* All rows can see this row */
7592         c->c[1][1] = comm->cell_x0[dim1];
7593         if (dlbIsOn(dd->comm))
7594         {
7595             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7596             if (bDistMB)
7597             {
7598                 /* For the multi-body distance we need the maximum */
7599                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7600             }
7601         }
7602         /* Set the upper-right corner for rounding */
7603         c->cr0 = comm->cell_x1[dim0];
7604
7605         if (dd->ndim >= 3)
7606         {
7607             dim2 = dd->dim[2];
7608             for (j = 0; j < 4; j++)
7609             {
7610                 c->c[2][j] = comm->cell_x0[dim2];
7611             }
7612             if (dlbIsOn(dd->comm))
7613             {
7614                 /* Use the maximum of the i-cells that see a j-cell */
7615                 for (i = 0; i < zones->nizone; i++)
7616                 {
7617                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7618                     {
7619                         if (j >= 4)
7620                         {
7621                             c->c[2][j-4] =
7622                                 std::max(c->c[2][j-4],
7623                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7624                         }
7625                     }
7626                 }
7627                 if (bDistMB)
7628                 {
7629                     /* For the multi-body distance we need the maximum */
7630                     c->bc[2] = comm->cell_x0[dim2];
7631                     for (i = 0; i < 2; i++)
7632                     {
7633                         for (j = 0; j < 2; j++)
7634                         {
7635                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7636                         }
7637                     }
7638                 }
7639             }
7640
7641             /* Set the upper-right corner for rounding */
7642             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7643              * Only cell (0,0,0) can see cell 7 (1,1,1)
7644              */
7645             c->cr1[0] = comm->cell_x1[dim1];
7646             c->cr1[3] = comm->cell_x1[dim1];
7647             if (dlbIsOn(dd->comm))
7648             {
7649                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7650                 if (bDistMB)
7651                 {
7652                     /* For the multi-body distance we need the maximum */
7653                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7654                 }
7655             }
7656         }
7657     }
7658 }
7659
7660 /* Determine which cg's we need to send in this pulse from this zone */
7661 static void
7662 get_zone_pulse_cgs(gmx_domdec_t *dd,
7663                    int zonei, int zone,
7664                    int cg0, int cg1,
7665                    const int *index_gl,
7666                    const int *cgindex,
7667                    int dim, int dim_ind,
7668                    int dim0, int dim1, int dim2,
7669                    real r_comm2, real r_bcomm2,
7670                    matrix box,
7671                    ivec tric_dist,
7672                    rvec *normal,
7673                    real skew_fac2_d, real skew_fac_01,
7674                    rvec *v_d, rvec *v_0, rvec *v_1,
7675                    const dd_corners_t *c,
7676                    rvec sf2_round,
7677                    gmx_bool bDistBonded,
7678                    gmx_bool bBondComm,
7679                    gmx_bool bDist2B,
7680                    gmx_bool bDistMB,
7681                    rvec *cg_cm,
7682                    int *cginfo,
7683                    gmx_domdec_ind_t *ind,
7684                    int **ibuf, int *ibuf_nalloc,
7685                    vec_rvec_t *vbuf,
7686                    int *nsend_ptr,
7687                    int *nat_ptr,
7688                    int *nsend_z_ptr)
7689 {
7690     gmx_domdec_comm_t *comm;
7691     gmx_bool           bScrew;
7692     gmx_bool           bDistMB_pulse;
7693     int                cg, i;
7694     real               r2, rb2, r, tric_sh;
7695     rvec               rn, rb;
7696     int                dimd;
7697     int                nsend_z, nsend, nat;
7698
7699     comm = dd->comm;
7700
7701     bScrew = (dd->bScrewPBC && dim == XX);
7702
7703     bDistMB_pulse = (bDistMB && bDistBonded);
7704
7705     nsend_z = 0;
7706     nsend   = *nsend_ptr;
7707     nat     = *nat_ptr;
7708
7709     for (cg = cg0; cg < cg1; cg++)
7710     {
7711         r2  = 0;
7712         rb2 = 0;
7713         if (tric_dist[dim_ind] == 0)
7714         {
7715             /* Rectangular direction, easy */
7716             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7717             if (r > 0)
7718             {
7719                 r2 += r*r;
7720             }
7721             if (bDistMB_pulse)
7722             {
7723                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7724                 if (r > 0)
7725                 {
7726                     rb2 += r*r;
7727                 }
7728             }
7729             /* Rounding gives at most a 16% reduction
7730              * in communicated atoms
7731              */
7732             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7733             {
7734                 r = cg_cm[cg][dim0] - c->cr0;
7735                 /* This is the first dimension, so always r >= 0 */
7736                 r2 += r*r;
7737                 if (bDistMB_pulse)
7738                 {
7739                     rb2 += r*r;
7740                 }
7741             }
7742             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7743             {
7744                 r = cg_cm[cg][dim1] - c->cr1[zone];
7745                 if (r > 0)
7746                 {
7747                     r2 += r*r;
7748                 }
7749                 if (bDistMB_pulse)
7750                 {
7751                     r = cg_cm[cg][dim1] - c->bcr1;
7752                     if (r > 0)
7753                     {
7754                         rb2 += r*r;
7755                     }
7756                 }
7757             }
7758         }
7759         else
7760         {
7761             /* Triclinic direction, more complicated */
7762             clear_rvec(rn);
7763             clear_rvec(rb);
7764             /* Rounding, conservative as the skew_fac multiplication
7765              * will slightly underestimate the distance.
7766              */
7767             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7768             {
7769                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7770                 for (i = dim0+1; i < DIM; i++)
7771                 {
7772                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7773                 }
7774                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7775                 if (bDistMB_pulse)
7776                 {
7777                     rb[dim0] = rn[dim0];
7778                     rb2      = r2;
7779                 }
7780                 /* Take care that the cell planes along dim0 might not
7781                  * be orthogonal to those along dim1 and dim2.
7782                  */
7783                 for (i = 1; i <= dim_ind; i++)
7784                 {
7785                     dimd = dd->dim[i];
7786                     if (normal[dim0][dimd] > 0)
7787                     {
7788                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7789                         if (bDistMB_pulse)
7790                         {
7791                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7792                         }
7793                     }
7794                 }
7795             }
7796             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7797             {
7798                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7799                 tric_sh   = 0;
7800                 for (i = dim1+1; i < DIM; i++)
7801                 {
7802                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7803                 }
7804                 rn[dim1] += tric_sh;
7805                 if (rn[dim1] > 0)
7806                 {
7807                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7808                     /* Take care of coupling of the distances
7809                      * to the planes along dim0 and dim1 through dim2.
7810                      */
7811                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7812                     /* Take care that the cell planes along dim1
7813                      * might not be orthogonal to that along dim2.
7814                      */
7815                     if (normal[dim1][dim2] > 0)
7816                     {
7817                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7818                     }
7819                 }
7820                 if (bDistMB_pulse)
7821                 {
7822                     rb[dim1] +=
7823                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7824                     if (rb[dim1] > 0)
7825                     {
7826                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7827                         /* Take care of coupling of the distances
7828                          * to the planes along dim0 and dim1 through dim2.
7829                          */
7830                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7831                         /* Take care that the cell planes along dim1
7832                          * might not be orthogonal to that along dim2.
7833                          */
7834                         if (normal[dim1][dim2] > 0)
7835                         {
7836                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7837                         }
7838                     }
7839                 }
7840             }
7841             /* The distance along the communication direction */
7842             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7843             tric_sh  = 0;
7844             for (i = dim+1; i < DIM; i++)
7845             {
7846                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7847             }
7848             rn[dim] += tric_sh;
7849             if (rn[dim] > 0)
7850             {
7851                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7852                 /* Take care of coupling of the distances
7853                  * to the planes along dim0 and dim1 through dim2.
7854                  */
7855                 if (dim_ind == 1 && zonei == 1)
7856                 {
7857                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7858                 }
7859             }
7860             if (bDistMB_pulse)
7861             {
7862                 clear_rvec(rb);
7863                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7864                 if (rb[dim] > 0)
7865                 {
7866                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7867                     /* Take care of coupling of the distances
7868                      * to the planes along dim0 and dim1 through dim2.
7869                      */
7870                     if (dim_ind == 1 && zonei == 1)
7871                     {
7872                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7873                     }
7874                 }
7875             }
7876         }
7877
7878         if (r2 < r_comm2 ||
7879             (bDistBonded &&
7880              ((bDistMB && rb2 < r_bcomm2) ||
7881               (bDist2B && r2  < r_bcomm2)) &&
7882              (!bBondComm ||
7883               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7884                missing_link(comm->cglink, index_gl[cg],
7885                             comm->bLocalCG)))))
7886         {
7887             /* Make an index to the local charge groups */
7888             if (nsend+1 > ind->nalloc)
7889             {
7890                 ind->nalloc = over_alloc_large(nsend+1);
7891                 srenew(ind->index, ind->nalloc);
7892             }
7893             if (nsend+1 > *ibuf_nalloc)
7894             {
7895                 *ibuf_nalloc = over_alloc_large(nsend+1);
7896                 srenew(*ibuf, *ibuf_nalloc);
7897             }
7898             ind->index[nsend] = cg;
7899             (*ibuf)[nsend]    = index_gl[cg];
7900             nsend_z++;
7901             vec_rvec_check_alloc(vbuf, nsend+1);
7902
7903             if (dd->ci[dim] == 0)
7904             {
7905                 /* Correct cg_cm for pbc */
7906                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7907                 if (bScrew)
7908                 {
7909                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7910                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7911                 }
7912             }
7913             else
7914             {
7915                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7916             }
7917             nsend++;
7918             nat += cgindex[cg+1] - cgindex[cg];
7919         }
7920     }
7921
7922     *nsend_ptr   = nsend;
7923     *nat_ptr     = nat;
7924     *nsend_z_ptr = nsend_z;
7925 }
7926
7927 static void setup_dd_communication(gmx_domdec_t *dd,
7928                                    matrix box, gmx_ddbox_t *ddbox,
7929                                    t_forcerec *fr, t_state *state, rvec **f)
7930 {
7931     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7932     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7933     int                    c, i, cg, cg_gl, nrcg;
7934     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7935     gmx_domdec_comm_t     *comm;
7936     gmx_domdec_zones_t    *zones;
7937     gmx_domdec_comm_dim_t *cd;
7938     gmx_domdec_ind_t      *ind;
7939     cginfo_mb_t           *cginfo_mb;
7940     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7941     real                   r_comm2, r_bcomm2;
7942     dd_corners_t           corners;
7943     ivec                   tric_dist;
7944     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7945     real                   skew_fac2_d, skew_fac_01;
7946     rvec                   sf2_round;
7947     int                    nsend, nat;
7948     int                    th;
7949
7950     if (debug)
7951     {
7952         fprintf(debug, "Setting up DD communication\n");
7953     }
7954
7955     comm  = dd->comm;
7956
7957     switch (fr->cutoff_scheme)
7958     {
7959         case ecutsGROUP:
7960             cg_cm = fr->cg_cm;
7961             break;
7962         case ecutsVERLET:
7963             cg_cm = state->x;
7964             break;
7965         default:
7966             gmx_incons("unimplemented");
7967             cg_cm = NULL;
7968     }
7969
7970     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7971     {
7972         /* Check if we need to use triclinic distances */
7973         tric_dist[dim_ind] = 0;
7974         for (i = 0; i <= dim_ind; i++)
7975         {
7976             if (ddbox->tric_dir[dd->dim[i]])
7977             {
7978                 tric_dist[dim_ind] = 1;
7979             }
7980         }
7981     }
7982
7983     bBondComm = comm->bBondComm;
7984
7985     /* Do we need to determine extra distances for multi-body bondeds? */
7986     bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
7987
7988     /* Do we need to determine extra distances for only two-body bondeds? */
7989     bDist2B = (bBondComm && !bDistMB);
7990
7991     r_comm2  = gmx::square(comm->cutoff);
7992     r_bcomm2 = gmx::square(comm->cutoff_mbody);
7993
7994     if (debug)
7995     {
7996         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
7997     }
7998
7999     zones = &comm->zones;
8000
8001     dim0 = dd->dim[0];
8002     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8003     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8004
8005     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8006
8007     /* Triclinic stuff */
8008     normal      = ddbox->normal;
8009     skew_fac_01 = 0;
8010     if (dd->ndim >= 2)
8011     {
8012         v_0 = ddbox->v[dim0];
8013         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8014         {
8015             /* Determine the coupling coefficient for the distances
8016              * to the cell planes along dim0 and dim1 through dim2.
8017              * This is required for correct rounding.
8018              */
8019             skew_fac_01 =
8020                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8021             if (debug)
8022             {
8023                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8024             }
8025         }
8026     }
8027     if (dd->ndim >= 3)
8028     {
8029         v_1 = ddbox->v[dim1];
8030     }
8031
8032     zone_cg_range = zones->cg_range;
8033     index_gl      = dd->index_gl;
8034     cgindex       = dd->cgindex;
8035     cginfo_mb     = fr->cginfo_mb;
8036
8037     zone_cg_range[0]   = 0;
8038     zone_cg_range[1]   = dd->ncg_home;
8039     comm->zone_ncg1[0] = dd->ncg_home;
8040     pos_cg             = dd->ncg_home;
8041
8042     nat_tot = dd->nat_home;
8043     nzone   = 1;
8044     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8045     {
8046         dim = dd->dim[dim_ind];
8047         cd  = &comm->cd[dim_ind];
8048
8049         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8050         {
8051             /* No pbc in this dimension, the first node should not comm. */
8052             nzone_send = 0;
8053         }
8054         else
8055         {
8056             nzone_send = nzone;
8057         }
8058
8059         v_d         = ddbox->v[dim];
8060         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8061
8062         cd->bInPlace = TRUE;
8063         for (p = 0; p < cd->np; p++)
8064         {
8065             /* Only atoms communicated in the first pulse are used
8066              * for multi-body bonded interactions or for bBondComm.
8067              */
8068             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8069
8070             ind   = &cd->ind[p];
8071             nsend = 0;
8072             nat   = 0;
8073             for (zone = 0; zone < nzone_send; zone++)
8074             {
8075                 if (tric_dist[dim_ind] && dim_ind > 0)
8076                 {
8077                     /* Determine slightly more optimized skew_fac's
8078                      * for rounding.
8079                      * This reduces the number of communicated atoms
8080                      * by about 10% for 3D DD of rhombic dodecahedra.
8081                      */
8082                     for (dimd = 0; dimd < dim; dimd++)
8083                     {
8084                         sf2_round[dimd] = 1;
8085                         if (ddbox->tric_dir[dimd])
8086                         {
8087                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8088                             {
8089                                 /* If we are shifted in dimension i
8090                                  * and the cell plane is tilted forward
8091                                  * in dimension i, skip this coupling.
8092                                  */
8093                                 if (!(zones->shift[nzone+zone][i] &&
8094                                       ddbox->v[dimd][i][dimd] >= 0))
8095                                 {
8096                                     sf2_round[dimd] +=
8097                                         gmx::square(ddbox->v[dimd][i][dimd]);
8098                                 }
8099                             }
8100                             sf2_round[dimd] = 1/sf2_round[dimd];
8101                         }
8102                     }
8103                 }
8104
8105                 zonei = zone_perm[dim_ind][zone];
8106                 if (p == 0)
8107                 {
8108                     /* Here we permutate the zones to obtain a convenient order
8109                      * for neighbor searching
8110                      */
8111                     cg0 = zone_cg_range[zonei];
8112                     cg1 = zone_cg_range[zonei+1];
8113                 }
8114                 else
8115                 {
8116                     /* Look only at the cg's received in the previous grid pulse
8117                      */
8118                     cg1 = zone_cg_range[nzone+zone+1];
8119                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8120                 }
8121
8122 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8123                 for (th = 0; th < comm->nth; th++)
8124                 {
8125                     try
8126                     {
8127                         gmx_domdec_ind_t *ind_p;
8128                         int             **ibuf_p, *ibuf_nalloc_p;
8129                         vec_rvec_t       *vbuf_p;
8130                         int              *nsend_p, *nat_p;
8131                         int              *nsend_zone_p;
8132                         int               cg0_th, cg1_th;
8133
8134                         if (th == 0)
8135                         {
8136                             /* Thread 0 writes in the comm buffers */
8137                             ind_p         = ind;
8138                             ibuf_p        = &comm->buf_int;
8139                             ibuf_nalloc_p = &comm->nalloc_int;
8140                             vbuf_p        = &comm->vbuf;
8141                             nsend_p       = &nsend;
8142                             nat_p         = &nat;
8143                             nsend_zone_p  = &ind->nsend[zone];
8144                         }
8145                         else
8146                         {
8147                             /* Other threads write into temp buffers */
8148                             ind_p         = &comm->dth[th].ind;
8149                             ibuf_p        = &comm->dth[th].ibuf;
8150                             ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8151                             vbuf_p        = &comm->dth[th].vbuf;
8152                             nsend_p       = &comm->dth[th].nsend;
8153                             nat_p         = &comm->dth[th].nat;
8154                             nsend_zone_p  = &comm->dth[th].nsend_zone;
8155
8156                             comm->dth[th].nsend      = 0;
8157                             comm->dth[th].nat        = 0;
8158                             comm->dth[th].nsend_zone = 0;
8159                         }
8160
8161                         if (comm->nth == 1)
8162                         {
8163                             cg0_th = cg0;
8164                             cg1_th = cg1;
8165                         }
8166                         else
8167                         {
8168                             cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8169                             cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8170                         }
8171
8172                         /* Get the cg's for this pulse in this zone */
8173                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8174                                            index_gl, cgindex,
8175                                            dim, dim_ind, dim0, dim1, dim2,
8176                                            r_comm2, r_bcomm2,
8177                                            box, tric_dist,
8178                                            normal, skew_fac2_d, skew_fac_01,
8179                                            v_d, v_0, v_1, &corners, sf2_round,
8180                                            bDistBonded, bBondComm,
8181                                            bDist2B, bDistMB,
8182                                            cg_cm, fr->cginfo,
8183                                            ind_p,
8184                                            ibuf_p, ibuf_nalloc_p,
8185                                            vbuf_p,
8186                                            nsend_p, nat_p,
8187                                            nsend_zone_p);
8188                     }
8189                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8190                 } // END
8191
8192                 /* Append data of threads>=1 to the communication buffers */
8193                 for (th = 1; th < comm->nth; th++)
8194                 {
8195                     dd_comm_setup_work_t *dth;
8196                     int                   i, ns1;
8197
8198                     dth = &comm->dth[th];
8199
8200                     ns1 = nsend + dth->nsend_zone;
8201                     if (ns1 > ind->nalloc)
8202                     {
8203                         ind->nalloc = over_alloc_dd(ns1);
8204                         srenew(ind->index, ind->nalloc);
8205                     }
8206                     if (ns1 > comm->nalloc_int)
8207                     {
8208                         comm->nalloc_int = over_alloc_dd(ns1);
8209                         srenew(comm->buf_int, comm->nalloc_int);
8210                     }
8211                     if (ns1 > comm->vbuf.nalloc)
8212                     {
8213                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8214                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8215                     }
8216
8217                     for (i = 0; i < dth->nsend_zone; i++)
8218                     {
8219                         ind->index[nsend]    = dth->ind.index[i];
8220                         comm->buf_int[nsend] = dth->ibuf[i];
8221                         copy_rvec(dth->vbuf.v[i],
8222                                   comm->vbuf.v[nsend]);
8223                         nsend++;
8224                     }
8225                     nat              += dth->nat;
8226                     ind->nsend[zone] += dth->nsend_zone;
8227                 }
8228             }
8229             /* Clear the counts in case we do not have pbc */
8230             for (zone = nzone_send; zone < nzone; zone++)
8231             {
8232                 ind->nsend[zone] = 0;
8233             }
8234             ind->nsend[nzone]   = nsend;
8235             ind->nsend[nzone+1] = nat;
8236             /* Communicate the number of cg's and atoms to receive */
8237             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8238                             ind->nsend, nzone+2,
8239                             ind->nrecv, nzone+2);
8240
8241             /* The rvec buffer is also required for atom buffers of size nsend
8242              * in dd_move_x and dd_move_f.
8243              */
8244             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8245
8246             if (p > 0)
8247             {
8248                 /* We can receive in place if only the last zone is not empty */
8249                 for (zone = 0; zone < nzone-1; zone++)
8250                 {
8251                     if (ind->nrecv[zone] > 0)
8252                     {
8253                         cd->bInPlace = FALSE;
8254                     }
8255                 }
8256                 if (!cd->bInPlace)
8257                 {
8258                     /* The int buffer is only required here for the cg indices */
8259                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8260                     {
8261                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8262                         srenew(comm->buf_int2, comm->nalloc_int2);
8263                     }
8264                     /* The rvec buffer is also required for atom buffers
8265                      * of size nrecv in dd_move_x and dd_move_f.
8266                      */
8267                     i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8268                     vec_rvec_check_alloc(&comm->vbuf2, i);
8269                 }
8270             }
8271
8272             /* Make space for the global cg indices */
8273             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8274                 || dd->cg_nalloc == 0)
8275             {
8276                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8277                 srenew(index_gl, dd->cg_nalloc);
8278                 srenew(cgindex, dd->cg_nalloc+1);
8279             }
8280             /* Communicate the global cg indices */
8281             if (cd->bInPlace)
8282             {
8283                 recv_i = index_gl + pos_cg;
8284             }
8285             else
8286             {
8287                 recv_i = comm->buf_int2;
8288             }
8289             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8290                             comm->buf_int, nsend,
8291                             recv_i,        ind->nrecv[nzone]);
8292
8293             /* Make space for cg_cm */
8294             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8295             if (fr->cutoff_scheme == ecutsGROUP)
8296             {
8297                 cg_cm = fr->cg_cm;
8298             }
8299             else
8300             {
8301                 cg_cm = state->x;
8302             }
8303             /* Communicate cg_cm */
8304             if (cd->bInPlace)
8305             {
8306                 recv_vr = cg_cm + pos_cg;
8307             }
8308             else
8309             {
8310                 recv_vr = comm->vbuf2.v;
8311             }
8312             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8313                              comm->vbuf.v, nsend,
8314                              recv_vr,      ind->nrecv[nzone]);
8315
8316             /* Make the charge group index */
8317             if (cd->bInPlace)
8318             {
8319                 zone = (p == 0 ? 0 : nzone - 1);
8320                 while (zone < nzone)
8321                 {
8322                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8323                     {
8324                         cg_gl              = index_gl[pos_cg];
8325                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8326                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8327                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8328                         if (bBondComm)
8329                         {
8330                             /* Update the charge group presence,
8331                              * so we can use it in the next pass of the loop.
8332                              */
8333                             comm->bLocalCG[cg_gl] = TRUE;
8334                         }
8335                         pos_cg++;
8336                     }
8337                     if (p == 0)
8338                     {
8339                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8340                     }
8341                     zone++;
8342                     zone_cg_range[nzone+zone] = pos_cg;
8343                 }
8344             }
8345             else
8346             {
8347                 /* This part of the code is never executed with bBondComm. */
8348                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8349                                  index_gl, recv_i, cg_cm, recv_vr,
8350                                  cgindex, fr->cginfo_mb, fr->cginfo);
8351                 pos_cg += ind->nrecv[nzone];
8352             }
8353             nat_tot += ind->nrecv[nzone+1];
8354         }
8355         if (!cd->bInPlace)
8356         {
8357             /* Store the atom block for easy copying of communication buffers */
8358             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8359         }
8360         nzone += nzone;
8361     }
8362     dd->index_gl = index_gl;
8363     dd->cgindex  = cgindex;
8364
8365     dd->ncg_tot          = zone_cg_range[zones->n];
8366     dd->nat_tot          = nat_tot;
8367     comm->nat[ddnatHOME] = dd->nat_home;
8368     for (i = ddnatZONE; i < ddnatNR; i++)
8369     {
8370         comm->nat[i] = dd->nat_tot;
8371     }
8372
8373     if (!bBondComm)
8374     {
8375         /* We don't need to update cginfo, since that was alrady done above.
8376          * So we pass NULL for the forcerec.
8377          */
8378         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8379                       NULL, comm->bLocalCG);
8380     }
8381
8382     if (debug)
8383     {
8384         fprintf(debug, "Finished setting up DD communication, zones:");
8385         for (c = 0; c < zones->n; c++)
8386         {
8387             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8388         }
8389         fprintf(debug, "\n");
8390     }
8391 }
8392
8393 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8394 {
8395     int c;
8396
8397     for (c = 0; c < zones->nizone; c++)
8398     {
8399         zones->izone[c].cg1  = zones->cg_range[c+1];
8400         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8401         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8402     }
8403 }
8404
8405 static void set_zones_size(gmx_domdec_t *dd,
8406                            matrix box, const gmx_ddbox_t *ddbox,
8407                            int zone_start, int zone_end)
8408 {
8409     gmx_domdec_comm_t  *comm;
8410     gmx_domdec_zones_t *zones;
8411     gmx_bool            bDistMB;
8412     int                 z, zi, d, dim;
8413     real                rcs, rcmbs;
8414     int                 i, j;
8415     real                vol;
8416
8417     comm = dd->comm;
8418
8419     zones = &comm->zones;
8420
8421     /* Do we need to determine extra distances for multi-body bondeds? */
8422     bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8423
8424     for (z = zone_start; z < zone_end; z++)
8425     {
8426         /* Copy cell limits to zone limits.
8427          * Valid for non-DD dims and non-shifted dims.
8428          */
8429         copy_rvec(comm->cell_x0, zones->size[z].x0);
8430         copy_rvec(comm->cell_x1, zones->size[z].x1);
8431     }
8432
8433     for (d = 0; d < dd->ndim; d++)
8434     {
8435         dim = dd->dim[d];
8436
8437         for (z = 0; z < zones->n; z++)
8438         {
8439             /* With a staggered grid we have different sizes
8440              * for non-shifted dimensions.
8441              */
8442             if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8443             {
8444                 if (d == 1)
8445                 {
8446                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8447                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8448                 }
8449                 else if (d == 2)
8450                 {
8451                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8452                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8453                 }
8454             }
8455         }
8456
8457         rcs   = comm->cutoff;
8458         rcmbs = comm->cutoff_mbody;
8459         if (ddbox->tric_dir[dim])
8460         {
8461             rcs   /= ddbox->skew_fac[dim];
8462             rcmbs /= ddbox->skew_fac[dim];
8463         }
8464
8465         /* Set the lower limit for the shifted zone dimensions */
8466         for (z = zone_start; z < zone_end; z++)
8467         {
8468             if (zones->shift[z][dim] > 0)
8469             {
8470                 dim = dd->dim[d];
8471                 if (!dlbIsOn(dd->comm) || d == 0)
8472                 {
8473                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8474                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8475                 }
8476                 else
8477                 {
8478                     /* Here we take the lower limit of the zone from
8479                      * the lowest domain of the zone below.
8480                      */
8481                     if (z < 4)
8482                     {
8483                         zones->size[z].x0[dim] =
8484                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8485                     }
8486                     else
8487                     {
8488                         if (d == 1)
8489                         {
8490                             zones->size[z].x0[dim] =
8491                                 zones->size[zone_perm[2][z-4]].x0[dim];
8492                         }
8493                         else
8494                         {
8495                             zones->size[z].x0[dim] =
8496                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8497                         }
8498                     }
8499                     /* A temporary limit, is updated below */
8500                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8501
8502                     if (bDistMB)
8503                     {
8504                         for (zi = 0; zi < zones->nizone; zi++)
8505                         {
8506                             if (zones->shift[zi][dim] == 0)
8507                             {
8508                                 /* This takes the whole zone into account.
8509                                  * With multiple pulses this will lead
8510                                  * to a larger zone then strictly necessary.
8511                                  */
8512                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8513                                                                   zones->size[zi].x1[dim]+rcmbs);
8514                             }
8515                         }
8516                     }
8517                 }
8518             }
8519         }
8520
8521         /* Loop over the i-zones to set the upper limit of each
8522          * j-zone they see.
8523          */
8524         for (zi = 0; zi < zones->nizone; zi++)
8525         {
8526             if (zones->shift[zi][dim] == 0)
8527             {
8528                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8529                 {
8530                     if (zones->shift[z][dim] > 0)
8531                     {
8532                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8533                                                           zones->size[zi].x1[dim]+rcs);
8534                     }
8535                 }
8536             }
8537         }
8538     }
8539
8540     for (z = zone_start; z < zone_end; z++)
8541     {
8542         /* Initialization only required to keep the compiler happy */
8543         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8544         int  nc, c;
8545
8546         /* To determine the bounding box for a zone we need to find
8547          * the extreme corners of 4, 2 or 1 corners.
8548          */
8549         nc = 1 << (ddbox->nboundeddim - 1);
8550
8551         for (c = 0; c < nc; c++)
8552         {
8553             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8554             corner[XX] = 0;
8555             if ((c & 1) == 0)
8556             {
8557                 corner[YY] = zones->size[z].x0[YY];
8558             }
8559             else
8560             {
8561                 corner[YY] = zones->size[z].x1[YY];
8562             }
8563             if ((c & 2) == 0)
8564             {
8565                 corner[ZZ] = zones->size[z].x0[ZZ];
8566             }
8567             else
8568             {
8569                 corner[ZZ] = zones->size[z].x1[ZZ];
8570             }
8571             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8572                 box[ZZ][1 - dd->dim[0]] != 0)
8573             {
8574                 /* With 1D domain decomposition the cg's are not in
8575                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
8576                  * Shift the corner of the z-vector back to along the box
8577                  * vector of dimension d, so it will later end up at 0 along d.
8578                  * This can affect the location of this corner along dd->dim[0]
8579                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
8580                  */
8581                 int d = 1 - dd->dim[0];
8582
8583                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8584             }
8585             /* Apply the triclinic couplings */
8586             assert(ddbox->npbcdim <= DIM);
8587             for (i = YY; i < ddbox->npbcdim; i++)
8588             {
8589                 for (j = XX; j < i; j++)
8590                 {
8591                     corner[j] += corner[i]*box[i][j]/box[i][i];
8592                 }
8593             }
8594             if (c == 0)
8595             {
8596                 copy_rvec(corner, corner_min);
8597                 copy_rvec(corner, corner_max);
8598             }
8599             else
8600             {
8601                 for (i = 0; i < DIM; i++)
8602                 {
8603                     corner_min[i] = std::min(corner_min[i], corner[i]);
8604                     corner_max[i] = std::max(corner_max[i], corner[i]);
8605                 }
8606             }
8607         }
8608         /* Copy the extreme cornes without offset along x */
8609         for (i = 0; i < DIM; i++)
8610         {
8611             zones->size[z].bb_x0[i] = corner_min[i];
8612             zones->size[z].bb_x1[i] = corner_max[i];
8613         }
8614         /* Add the offset along x */
8615         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8616         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8617     }
8618
8619     if (zone_start == 0)
8620     {
8621         vol = 1;
8622         for (dim = 0; dim < DIM; dim++)
8623         {
8624             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8625         }
8626         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8627     }
8628
8629     if (debug)
8630     {
8631         for (z = zone_start; z < zone_end; z++)
8632         {
8633             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8634                     z,
8635                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8636                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8637                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8638             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8639                     z,
8640                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8641                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8642                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8643         }
8644     }
8645 }
8646
8647 static int comp_cgsort(const void *a, const void *b)
8648 {
8649     int           comp;
8650
8651     gmx_cgsort_t *cga, *cgb;
8652     cga = (gmx_cgsort_t *)a;
8653     cgb = (gmx_cgsort_t *)b;
8654
8655     comp = cga->nsc - cgb->nsc;
8656     if (comp == 0)
8657     {
8658         comp = cga->ind_gl - cgb->ind_gl;
8659     }
8660
8661     return comp;
8662 }
8663
8664 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8665                          int *a, int *buf)
8666 {
8667     int i;
8668
8669     /* Order the data */
8670     for (i = 0; i < n; i++)
8671     {
8672         buf[i] = a[sort[i].ind];
8673     }
8674
8675     /* Copy back to the original array */
8676     for (i = 0; i < n; i++)
8677     {
8678         a[i] = buf[i];
8679     }
8680 }
8681
8682 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8683                          rvec *v, rvec *buf)
8684 {
8685     int i;
8686
8687     /* Order the data */
8688     for (i = 0; i < n; i++)
8689     {
8690         copy_rvec(v[sort[i].ind], buf[i]);
8691     }
8692
8693     /* Copy back to the original array */
8694     for (i = 0; i < n; i++)
8695     {
8696         copy_rvec(buf[i], v[i]);
8697     }
8698 }
8699
8700 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8701                            rvec *v, rvec *buf)
8702 {
8703     int a, atot, cg, cg0, cg1, i;
8704
8705     if (cgindex == NULL)
8706     {
8707         /* Avoid the useless loop of the atoms within a cg */
8708         order_vec_cg(ncg, sort, v, buf);
8709
8710         return;
8711     }
8712
8713     /* Order the data */
8714     a = 0;
8715     for (cg = 0; cg < ncg; cg++)
8716     {
8717         cg0 = cgindex[sort[cg].ind];
8718         cg1 = cgindex[sort[cg].ind+1];
8719         for (i = cg0; i < cg1; i++)
8720         {
8721             copy_rvec(v[i], buf[a]);
8722             a++;
8723         }
8724     }
8725     atot = a;
8726
8727     /* Copy back to the original array */
8728     for (a = 0; a < atot; a++)
8729     {
8730         copy_rvec(buf[a], v[a]);
8731     }
8732 }
8733
8734 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8735                          int nsort_new, gmx_cgsort_t *sort_new,
8736                          gmx_cgsort_t *sort1)
8737 {
8738     int i1, i2, i_new;
8739
8740     /* The new indices are not very ordered, so we qsort them */
8741     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8742
8743     /* sort2 is already ordered, so now we can merge the two arrays */
8744     i1    = 0;
8745     i2    = 0;
8746     i_new = 0;
8747     while (i2 < nsort2 || i_new < nsort_new)
8748     {
8749         if (i2 == nsort2)
8750         {
8751             sort1[i1++] = sort_new[i_new++];
8752         }
8753         else if (i_new == nsort_new)
8754         {
8755             sort1[i1++] = sort2[i2++];
8756         }
8757         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8758                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8759                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8760         {
8761             sort1[i1++] = sort2[i2++];
8762         }
8763         else
8764         {
8765             sort1[i1++] = sort_new[i_new++];
8766         }
8767     }
8768 }
8769
8770 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8771 {
8772     gmx_domdec_sort_t *sort;
8773     gmx_cgsort_t      *cgsort, *sort_i;
8774     int                ncg_new, nsort2, nsort_new, i, *a, moved;
8775
8776     sort = dd->comm->sort;
8777
8778     a = fr->ns->grid->cell_index;
8779
8780     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8781
8782     if (ncg_home_old >= 0)
8783     {
8784         /* The charge groups that remained in the same ns grid cell
8785          * are completely ordered. So we can sort efficiently by sorting
8786          * the charge groups that did move into the stationary list.
8787          */
8788         ncg_new   = 0;
8789         nsort2    = 0;
8790         nsort_new = 0;
8791         for (i = 0; i < dd->ncg_home; i++)
8792         {
8793             /* Check if this cg did not move to another node */
8794             if (a[i] < moved)
8795             {
8796                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8797                 {
8798                     /* This cg is new on this node or moved ns grid cell */
8799                     if (nsort_new >= sort->sort_new_nalloc)
8800                     {
8801                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8802                         srenew(sort->sort_new, sort->sort_new_nalloc);
8803                     }
8804                     sort_i = &(sort->sort_new[nsort_new++]);
8805                 }
8806                 else
8807                 {
8808                     /* This cg did not move */
8809                     sort_i = &(sort->sort2[nsort2++]);
8810                 }
8811                 /* Sort on the ns grid cell indices
8812                  * and the global topology index.
8813                  * index_gl is irrelevant with cell ns,
8814                  * but we set it here anyhow to avoid a conditional.
8815                  */
8816                 sort_i->nsc    = a[i];
8817                 sort_i->ind_gl = dd->index_gl[i];
8818                 sort_i->ind    = i;
8819                 ncg_new++;
8820             }
8821         }
8822         if (debug)
8823         {
8824             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8825                     nsort2, nsort_new);
8826         }
8827         /* Sort efficiently */
8828         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8829                      sort->sort);
8830     }
8831     else
8832     {
8833         cgsort  = sort->sort;
8834         ncg_new = 0;
8835         for (i = 0; i < dd->ncg_home; i++)
8836         {
8837             /* Sort on the ns grid cell indices
8838              * and the global topology index
8839              */
8840             cgsort[i].nsc    = a[i];
8841             cgsort[i].ind_gl = dd->index_gl[i];
8842             cgsort[i].ind    = i;
8843             if (cgsort[i].nsc < moved)
8844             {
8845                 ncg_new++;
8846             }
8847         }
8848         if (debug)
8849         {
8850             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8851         }
8852         /* Determine the order of the charge groups using qsort */
8853         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8854     }
8855
8856     return ncg_new;
8857 }
8858
8859 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8860 {
8861     gmx_cgsort_t *sort;
8862     int           ncg_new, i, na;
8863     const int    *a;
8864
8865     sort = dd->comm->sort->sort;
8866
8867     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8868
8869     ncg_new = 0;
8870     for (i = 0; i < na; i++)
8871     {
8872         if (a[i] >= 0)
8873         {
8874             sort[ncg_new].ind = a[i];
8875             ncg_new++;
8876         }
8877     }
8878
8879     return ncg_new;
8880 }
8881
8882 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8883                           int ncg_home_old)
8884 {
8885     gmx_domdec_sort_t *sort;
8886     gmx_cgsort_t      *cgsort;
8887     int               *cgindex;
8888     int                ncg_new, i, *ibuf, cgsize;
8889     rvec              *vbuf;
8890
8891     sort = dd->comm->sort;
8892
8893     if (dd->ncg_home > sort->sort_nalloc)
8894     {
8895         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8896         srenew(sort->sort, sort->sort_nalloc);
8897         srenew(sort->sort2, sort->sort_nalloc);
8898     }
8899     cgsort = sort->sort;
8900
8901     switch (fr->cutoff_scheme)
8902     {
8903         case ecutsGROUP:
8904             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8905             break;
8906         case ecutsVERLET:
8907             ncg_new = dd_sort_order_nbnxn(dd, fr);
8908             break;
8909         default:
8910             gmx_incons("unimplemented");
8911             ncg_new = 0;
8912     }
8913
8914     /* We alloc with the old size, since cgindex is still old */
8915     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8916     vbuf = dd->comm->vbuf.v;
8917
8918     if (dd->comm->bCGs)
8919     {
8920         cgindex = dd->cgindex;
8921     }
8922     else
8923     {
8924         cgindex = NULL;
8925     }
8926
8927     /* Remove the charge groups which are no longer at home here */
8928     dd->ncg_home = ncg_new;
8929     if (debug)
8930     {
8931         fprintf(debug, "Set the new home charge group count to %d\n",
8932                 dd->ncg_home);
8933     }
8934
8935     /* Reorder the state */
8936     for (i = 0; i < estNR; i++)
8937     {
8938         if (EST_DISTR(i) && (state->flags & (1<<i)))
8939         {
8940             switch (i)
8941             {
8942                 case estX:
8943                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8944                     break;
8945                 case estV:
8946                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8947                     break;
8948                 case estSDX:
8949                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8950                     break;
8951                 case estCGP:
8952                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8953                     break;
8954                 case estLD_RNG:
8955                 case estLD_RNGI:
8956                 case estDISRE_INITF:
8957                 case estDISRE_RM3TAV:
8958                 case estORIRE_INITF:
8959                 case estORIRE_DTAV:
8960                     /* No ordering required */
8961                     break;
8962                 default:
8963                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8964                     break;
8965             }
8966         }
8967     }
8968     if (fr->cutoff_scheme == ecutsGROUP)
8969     {
8970         /* Reorder cgcm */
8971         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8972     }
8973
8974     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8975     {
8976         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8977         srenew(sort->ibuf, sort->ibuf_nalloc);
8978     }
8979     ibuf = sort->ibuf;
8980     /* Reorder the global cg index */
8981     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8982     /* Reorder the cginfo */
8983     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8984     /* Rebuild the local cg index */
8985     if (dd->comm->bCGs)
8986     {
8987         ibuf[0] = 0;
8988         for (i = 0; i < dd->ncg_home; i++)
8989         {
8990             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8991             ibuf[i+1] = ibuf[i] + cgsize;
8992         }
8993         for (i = 0; i < dd->ncg_home+1; i++)
8994         {
8995             dd->cgindex[i] = ibuf[i];
8996         }
8997     }
8998     else
8999     {
9000         for (i = 0; i < dd->ncg_home+1; i++)
9001         {
9002             dd->cgindex[i] = i;
9003         }
9004     }
9005     /* Set the home atom number */
9006     dd->nat_home = dd->cgindex[dd->ncg_home];
9007
9008     if (fr->cutoff_scheme == ecutsVERLET)
9009     {
9010         /* The atoms are now exactly in grid order, update the grid order */
9011         nbnxn_set_atomorder(fr->nbv->nbs);
9012     }
9013     else
9014     {
9015         /* Copy the sorted ns cell indices back to the ns grid struct */
9016         for (i = 0; i < dd->ncg_home; i++)
9017         {
9018             fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9019         }
9020         fr->ns->grid->nr = dd->ncg_home;
9021     }
9022 }
9023
9024 static void add_dd_statistics(gmx_domdec_t *dd)
9025 {
9026     gmx_domdec_comm_t *comm;
9027     int                ddnat;
9028
9029     comm = dd->comm;
9030
9031     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9032     {
9033         comm->sum_nat[ddnat-ddnatZONE] +=
9034             comm->nat[ddnat] - comm->nat[ddnat-1];
9035     }
9036     comm->ndecomp++;
9037 }
9038
9039 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9040 {
9041     gmx_domdec_comm_t *comm;
9042     int                ddnat;
9043
9044     comm = dd->comm;
9045
9046     /* Reset all the statistics and counters for total run counting */
9047     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9048     {
9049         comm->sum_nat[ddnat-ddnatZONE] = 0;
9050     }
9051     comm->ndecomp   = 0;
9052     comm->nload     = 0;
9053     comm->load_step = 0;
9054     comm->load_sum  = 0;
9055     comm->load_max  = 0;
9056     clear_ivec(comm->load_lim);
9057     comm->load_mdf = 0;
9058     comm->load_pme = 0;
9059 }
9060
9061 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9062 {
9063     gmx_domdec_comm_t *comm;
9064     int                ddnat;
9065     double             av;
9066
9067     comm = cr->dd->comm;
9068
9069     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9070
9071     if (fplog == NULL)
9072     {
9073         return;
9074     }
9075
9076     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9077
9078     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9079     {
9080         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9081         switch (ddnat)
9082         {
9083             case ddnatZONE:
9084                 fprintf(fplog,
9085                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9086                         2, av);
9087                 break;
9088             case ddnatVSITE:
9089                 if (cr->dd->vsite_comm)
9090                 {
9091                     fprintf(fplog,
9092                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9093                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9094                             av);
9095                 }
9096                 break;
9097             case ddnatCON:
9098                 if (cr->dd->constraint_comm)
9099                 {
9100                     fprintf(fplog,
9101                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9102                             1 + ir->nLincsIter, av);
9103                 }
9104                 break;
9105             default:
9106                 gmx_incons(" Unknown type for DD statistics");
9107         }
9108     }
9109     fprintf(fplog, "\n");
9110
9111     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9112     {
9113         print_dd_load_av(fplog, cr->dd);
9114     }
9115 }
9116
9117 void dd_partition_system(FILE                *fplog,
9118                          gmx_int64_t          step,
9119                          t_commrec           *cr,
9120                          gmx_bool             bMasterState,
9121                          int                  nstglobalcomm,
9122                          t_state             *state_global,
9123                          gmx_mtop_t          *top_global,
9124                          t_inputrec          *ir,
9125                          t_state             *state_local,
9126                          rvec               **f,
9127                          t_mdatoms           *mdatoms,
9128                          gmx_localtop_t      *top_local,
9129                          t_forcerec          *fr,
9130                          gmx_vsite_t         *vsite,
9131                          gmx_shellfc_t       *shellfc,
9132                          gmx_constr_t         constr,
9133                          t_nrnb              *nrnb,
9134                          gmx_wallcycle_t      wcycle,
9135                          gmx_bool             bVerbose)
9136 {
9137     gmx_domdec_t      *dd;
9138     gmx_domdec_comm_t *comm;
9139     gmx_ddbox_t        ddbox = {0};
9140     t_block           *cgs_gl;
9141     gmx_int64_t        step_pcoupl;
9142     rvec               cell_ns_x0, cell_ns_x1;
9143     int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9144     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
9145     gmx_bool           bRedist, bSortCG, bResortAll;
9146     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9147     real               grid_density;
9148     char               sbuf[22];
9149
9150     wallcycle_start(wcycle, ewcDOMDEC);
9151
9152     dd   = cr->dd;
9153     comm = dd->comm;
9154
9155     bBoxChanged = (bMasterState || inputrecDeform(ir));
9156     if (ir->epc != epcNO)
9157     {
9158         /* With nstpcouple > 1 pressure coupling happens.
9159          * one step after calculating the pressure.
9160          * Box scaling happens at the end of the MD step,
9161          * after the DD partitioning.
9162          * We therefore have to do DLB in the first partitioning
9163          * after an MD step where P-coupling occured.
9164          * We need to determine the last step in which p-coupling occurred.
9165          * MRS -- need to validate this for vv?
9166          */
9167         n = ir->nstpcouple;
9168         if (n == 1)
9169         {
9170             step_pcoupl = step - 1;
9171         }
9172         else
9173         {
9174             step_pcoupl = ((step - 1)/n)*n + 1;
9175         }
9176         if (step_pcoupl >= comm->partition_step)
9177         {
9178             bBoxChanged = TRUE;
9179         }
9180     }
9181
9182     bNStGlobalComm = (step % nstglobalcomm == 0);
9183
9184     if (!dlbIsOn(comm))
9185     {
9186         bDoDLB = FALSE;
9187     }
9188     else
9189     {
9190         /* Should we do dynamic load balacing this step?
9191          * Since it requires (possibly expensive) global communication,
9192          * we might want to do DLB less frequently.
9193          */
9194         if (bBoxChanged || ir->epc != epcNO)
9195         {
9196             bDoDLB = bBoxChanged;
9197         }
9198         else
9199         {
9200             bDoDLB = bNStGlobalComm;
9201         }
9202     }
9203
9204     /* Check if we have recorded loads on the nodes */
9205     if (comm->bRecordLoad && dd_load_count(comm) > 0)
9206     {
9207         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9208
9209         /* Print load every nstlog, first and last step to the log file */
9210         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9211                     comm->n_load_collect == 0 ||
9212                     (ir->nsteps >= 0 &&
9213                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9214
9215         /* Avoid extra communication due to verbose screen output
9216          * when nstglobalcomm is set.
9217          */
9218         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9219             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9220         {
9221             get_load_distribution(dd, wcycle);
9222             if (DDMASTER(dd))
9223             {
9224                 if (bLogLoad)
9225                 {
9226                     dd_print_load(fplog, dd, step-1);
9227                 }
9228                 if (bVerbose)
9229                 {
9230                     dd_print_load_verbose(dd);
9231                 }
9232             }
9233             comm->n_load_collect++;
9234
9235             if (bCheckWhetherToTurnDlbOn)
9236             {
9237                 /* Since the timings are node dependent, the master decides */
9238                 if (DDMASTER(dd))
9239                 {
9240                     /* Here we check if the max PME rank load is more than 0.98
9241                      * the max PP force load. If so, PP DLB will not help,
9242                      * since we are (almost) limited by PME. Furthermore,
9243                      * DLB will cause a significant extra x/f redistribution
9244                      * cost on the PME ranks, which will then surely result
9245                      * in lower total performance.
9246                      * This check might be fragile, since one measurement
9247                      * below 0.98 (although only done once every 100 DD part.)
9248                      * could turn on DLB for the rest of the run.
9249                      */
9250                     if (cr->npmenodes > 0 &&
9251                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9252                     {
9253                         bTurnOnDLB = FALSE;
9254                     }
9255                     else
9256                     {
9257                         bTurnOnDLB =
9258                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9259                     }
9260                     if (debug)
9261                     {
9262                         fprintf(debug, "step %s, imb loss %f\n",
9263                                 gmx_step_str(step, sbuf),
9264                                 dd_force_imb_perf_loss(dd));
9265                     }
9266                 }
9267                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9268                 if (bTurnOnDLB)
9269                 {
9270                     turn_on_dlb(fplog, cr, step);
9271                     bDoDLB = TRUE;
9272                 }
9273             }
9274         }
9275         comm->n_load_have++;
9276     }
9277
9278     cgs_gl = &comm->cgs_gl;
9279
9280     bRedist = FALSE;
9281     if (bMasterState)
9282     {
9283         /* Clear the old state */
9284         clear_dd_indices(dd, 0, 0);
9285         ncgindex_set = 0;
9286
9287         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9288                   TRUE, cgs_gl, state_global->x, &ddbox);
9289
9290         get_cg_distribution(fplog, dd, cgs_gl,
9291                             state_global->box, &ddbox, state_global->x);
9292
9293         dd_distribute_state(dd, cgs_gl,
9294                             state_global, state_local, f);
9295
9296         dd_make_local_cgs(dd, &top_local->cgs);
9297
9298         /* Ensure that we have space for the new distribution */
9299         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9300
9301         if (fr->cutoff_scheme == ecutsGROUP)
9302         {
9303             calc_cgcm(fplog, 0, dd->ncg_home,
9304                       &top_local->cgs, state_local->x, fr->cg_cm);
9305         }
9306
9307         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9308
9309         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9310     }
9311     else if (state_local->ddp_count != dd->ddp_count)
9312     {
9313         if (state_local->ddp_count > dd->ddp_count)
9314         {
9315             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9316         }
9317
9318         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9319         {
9320             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9321         }
9322
9323         /* Clear the old state */
9324         clear_dd_indices(dd, 0, 0);
9325
9326         /* Build the new indices */
9327         rebuild_cgindex(dd, cgs_gl->index, state_local);
9328         make_dd_indices(dd, cgs_gl->index, 0);
9329         ncgindex_set = dd->ncg_home;
9330
9331         if (fr->cutoff_scheme == ecutsGROUP)
9332         {
9333             /* Redetermine the cg COMs */
9334             calc_cgcm(fplog, 0, dd->ncg_home,
9335                       &top_local->cgs, state_local->x, fr->cg_cm);
9336         }
9337
9338         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9339
9340         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9341
9342         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9343                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9344
9345         bRedist = dlbIsOn(comm);
9346     }
9347     else
9348     {
9349         /* We have the full state, only redistribute the cgs */
9350
9351         /* Clear the non-home indices */
9352         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9353         ncgindex_set = 0;
9354
9355         /* Avoid global communication for dim's without pbc and -gcom */
9356         if (!bNStGlobalComm)
9357         {
9358             copy_rvec(comm->box0, ddbox.box0    );
9359             copy_rvec(comm->box_size, ddbox.box_size);
9360         }
9361         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9362                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9363
9364         bBoxChanged = TRUE;
9365         bRedist     = TRUE;
9366     }
9367     /* For dim's without pbc and -gcom */
9368     copy_rvec(ddbox.box0, comm->box0    );
9369     copy_rvec(ddbox.box_size, comm->box_size);
9370
9371     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9372                       step, wcycle);
9373
9374     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9375     {
9376         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9377     }
9378
9379     /* Check if we should sort the charge groups */
9380     if (comm->nstSortCG > 0)
9381     {
9382         bSortCG = (bMasterState ||
9383                    (bRedist && (step % comm->nstSortCG == 0)));
9384     }
9385     else
9386     {
9387         bSortCG = FALSE;
9388     }
9389
9390     ncg_home_old = dd->ncg_home;
9391
9392     ncg_moved = 0;
9393     if (bRedist)
9394     {
9395         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9396
9397         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9398                            state_local, f, fr,
9399                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9400
9401         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9402     }
9403
9404     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9405                           dd, &ddbox,
9406                           &comm->cell_x0, &comm->cell_x1,
9407                           dd->ncg_home, fr->cg_cm,
9408                           cell_ns_x0, cell_ns_x1, &grid_density);
9409
9410     if (bBoxChanged)
9411     {
9412         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9413     }
9414
9415     switch (fr->cutoff_scheme)
9416     {
9417         case ecutsGROUP:
9418             copy_ivec(fr->ns->grid->n, ncells_old);
9419             grid_first(fplog, fr->ns->grid, dd, &ddbox,
9420                        state_local->box, cell_ns_x0, cell_ns_x1,
9421                        fr->rlist, grid_density);
9422             break;
9423         case ecutsVERLET:
9424             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9425             break;
9426         default:
9427             gmx_incons("unimplemented");
9428     }
9429     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9430     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9431
9432     if (bSortCG)
9433     {
9434         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9435
9436         /* Sort the state on charge group position.
9437          * This enables exact restarts from this step.
9438          * It also improves performance by about 15% with larger numbers
9439          * of atoms per node.
9440          */
9441
9442         /* Fill the ns grid with the home cell,
9443          * so we can sort with the indices.
9444          */
9445         set_zones_ncg_home(dd);
9446
9447         switch (fr->cutoff_scheme)
9448         {
9449             case ecutsVERLET:
9450                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9451
9452                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9453                                   0,
9454                                   comm->zones.size[0].bb_x0,
9455                                   comm->zones.size[0].bb_x1,
9456                                   0, dd->ncg_home,
9457                                   comm->zones.dens_zone0,
9458                                   fr->cginfo,
9459                                   state_local->x,
9460                                   ncg_moved, bRedist ? comm->moved : NULL,
9461                                   fr->nbv->grp[eintLocal].kernel_type,
9462                                   fr->nbv->grp[eintLocal].nbat);
9463
9464                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9465                 break;
9466             case ecutsGROUP:
9467                 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9468                           0, dd->ncg_home, fr->cg_cm);
9469
9470                 copy_ivec(fr->ns->grid->n, ncells_new);
9471                 break;
9472             default:
9473                 gmx_incons("unimplemented");
9474         }
9475
9476         bResortAll = bMasterState;
9477
9478         /* Check if we can user the old order and ns grid cell indices
9479          * of the charge groups to sort the charge groups efficiently.
9480          */
9481         if (ncells_new[XX] != ncells_old[XX] ||
9482             ncells_new[YY] != ncells_old[YY] ||
9483             ncells_new[ZZ] != ncells_old[ZZ])
9484         {
9485             bResortAll = TRUE;
9486         }
9487
9488         if (debug)
9489         {
9490             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9491                     gmx_step_str(step, sbuf), dd->ncg_home);
9492         }
9493         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9494                       bResortAll ? -1 : ncg_home_old);
9495         /* Rebuild all the indices */
9496         ga2la_clear(dd->ga2la);
9497         ncgindex_set = 0;
9498
9499         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9500     }
9501
9502     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9503
9504     /* Setup up the communication and communicate the coordinates */
9505     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9506
9507     /* Set the indices */
9508     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9509
9510     /* Set the charge group boundaries for neighbor searching */
9511     set_cg_boundaries(&comm->zones);
9512
9513     if (fr->cutoff_scheme == ecutsVERLET)
9514     {
9515         set_zones_size(dd, state_local->box, &ddbox,
9516                        bSortCG ? 1 : 0, comm->zones.n);
9517     }
9518
9519     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9520
9521     /*
9522        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9523                  -1,state_local->x,state_local->box);
9524      */
9525
9526     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9527
9528     /* Extract a local topology from the global topology */
9529     for (i = 0; i < dd->ndim; i++)
9530     {
9531         np[dd->dim[i]] = comm->cd[i].np;
9532     }
9533     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9534                       comm->cellsize_min, np,
9535                       fr,
9536                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9537                       vsite, top_global, top_local);
9538
9539     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9540
9541     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9542
9543     /* Set up the special atom communication */
9544     n = comm->nat[ddnatZONE];
9545     for (i = ddnatZONE+1; i < ddnatNR; i++)
9546     {
9547         switch (i)
9548         {
9549             case ddnatVSITE:
9550                 if (vsite && vsite->n_intercg_vsite)
9551                 {
9552                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9553                 }
9554                 break;
9555             case ddnatCON:
9556                 if (dd->bInterCGcons || dd->bInterCGsettles)
9557                 {
9558                     /* Only for inter-cg constraints we need special code */
9559                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9560                                                   constr, ir->nProjOrder,
9561                                                   top_local->idef.il);
9562                 }
9563                 break;
9564             default:
9565                 gmx_incons("Unknown special atom type setup");
9566         }
9567         comm->nat[i] = n;
9568     }
9569
9570     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9571
9572     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9573
9574     /* Make space for the extra coordinates for virtual site
9575      * or constraint communication.
9576      */
9577     state_local->natoms = comm->nat[ddnatNR-1];
9578     if (state_local->natoms > state_local->nalloc)
9579     {
9580         dd_realloc_state(state_local, f, state_local->natoms);
9581     }
9582
9583     if (fr->bF_NoVirSum)
9584     {
9585         if (vsite && vsite->n_intercg_vsite)
9586         {
9587             nat_f_novirsum = comm->nat[ddnatVSITE];
9588         }
9589         else
9590         {
9591             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9592             {
9593                 nat_f_novirsum = dd->nat_tot;
9594             }
9595             else
9596             {
9597                 nat_f_novirsum = dd->nat_home;
9598             }
9599         }
9600     }
9601     else
9602     {
9603         nat_f_novirsum = 0;
9604     }
9605
9606     /* Set the number of atoms required for the force calculation.
9607      * Forces need to be constrained when doing energy
9608      * minimization. For simple simulations we could avoid some
9609      * allocation, zeroing and copying, but this is probably not worth
9610      * the complications and checking.
9611      */
9612     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9613                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9614
9615     /* We make the all mdatoms up to nat_tot_con.
9616      * We could save some work by only setting invmass
9617      * between nat_tot and nat_tot_con.
9618      */
9619     /* This call also sets the new number of home particles to dd->nat_home */
9620     atoms2md(top_global, ir,
9621              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9622
9623     /* Now we have the charges we can sort the FE interactions */
9624     dd_sort_local_top(dd, mdatoms, top_local);
9625
9626     if (vsite != NULL)
9627     {
9628         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9629         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9630                                   mdatoms, FALSE, vsite);
9631     }
9632
9633     if (shellfc)
9634     {
9635         /* Make the local shell stuff, currently no communication is done */
9636         make_local_shells(cr, mdatoms, shellfc);
9637     }
9638
9639     if (ir->implicit_solvent)
9640     {
9641         make_local_gb(cr, fr->born, ir->gb_algorithm);
9642     }
9643
9644     setup_bonded_threading(fr, &top_local->idef);
9645
9646     if (!(cr->duty & DUTY_PME))
9647     {
9648         /* Send the charges and/or c6/sigmas to our PME only node */
9649         gmx_pme_send_parameters(cr,
9650                                 fr->ic,
9651                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9652                                 mdatoms->chargeA, mdatoms->chargeB,
9653                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9654                                 mdatoms->sigmaA, mdatoms->sigmaB,
9655                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9656     }
9657
9658     if (constr)
9659     {
9660         set_constraints(constr, top_local, ir, mdatoms, cr);
9661     }
9662
9663     if (ir->bPull)
9664     {
9665         /* Update the local pull groups */
9666         dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9667     }
9668
9669     if (ir->bRot)
9670     {
9671         /* Update the local rotation groups */
9672         dd_make_local_rotation_groups(dd, ir->rot);
9673     }
9674
9675     if (ir->eSwapCoords != eswapNO)
9676     {
9677         /* Update the local groups needed for ion swapping */
9678         dd_make_local_swap_groups(dd, ir->swap);
9679     }
9680
9681     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9682     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9683
9684     add_dd_statistics(dd);
9685
9686     /* Make sure we only count the cycles for this DD partitioning */
9687     clear_dd_cycle_counts(dd);
9688
9689     /* Because the order of the atoms might have changed since
9690      * the last vsite construction, we need to communicate the constructing
9691      * atom coordinates again (for spreading the forces this MD step).
9692      */
9693     dd_move_x_vsites(dd, state_local->box, state_local->x);
9694
9695     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9696
9697     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9698     {
9699         dd_move_x(dd, state_local->box, state_local->x);
9700         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9701                      -1, state_local->x, state_local->box);
9702     }
9703
9704     /* Store the partitioning step */
9705     comm->partition_step = step;
9706
9707     /* Increase the DD partitioning counter */
9708     dd->ddp_count++;
9709     /* The state currently matches this DD partitioning count, store it */
9710     state_local->ddp_count = dd->ddp_count;
9711     if (bMasterState)
9712     {
9713         /* The DD master node knows the complete cg distribution,
9714          * store the count so we can possibly skip the cg info communication.
9715          */
9716         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9717     }
9718
9719     if (comm->DD_debug > 0)
9720     {
9721         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9722         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9723                                 "after partitioning");
9724     }
9725
9726     wallcycle_stop(wcycle, ewcDOMDEC);
9727 }