Moved df_history utility routines from typedefs to new file.
[gromacs.git] / src / gromacs / domdec / domdec.cpp
blob 5de9e092900628502ada4b759519bf24d0bdd944
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
49 #include <algorithm>
51 #include "gromacs/domdec/domdec_network.h"
52 #include "gromacs/ewald/pme.h"
53 #include "gromacs/fileio/gmxfio.h"
54 #include "gromacs/fileio/pdbio.h"
55 #include "gromacs/gmxlib/chargegroup.h"
56 #include "gromacs/gmxlib/df_history.h"
57 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
58 #include "gromacs/imd/imd.h"
59 #include "gromacs/legacyheaders/force.h"
60 #include "gromacs/legacyheaders/genborn.h"
61 #include "gromacs/legacyheaders/gmx_ga2la.h"
62 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
63 #include "gromacs/legacyheaders/names.h"
64 #include "gromacs/legacyheaders/network.h"
65 #include "gromacs/legacyheaders/nrnb.h"
66 #include "gromacs/legacyheaders/nsgrid.h"
67 #include "gromacs/legacyheaders/typedefs.h"
68 #include "gromacs/legacyheaders/vsite.h"
69 #include "gromacs/legacyheaders/types/commrec.h"
70 #include "gromacs/legacyheaders/types/enums.h"
71 #include "gromacs/legacyheaders/types/forcerec.h"
72 #include "gromacs/legacyheaders/types/hw_info.h"
73 #include "gromacs/legacyheaders/types/ifunc.h"
74 #include "gromacs/legacyheaders/types/inputrec.h"
75 #include "gromacs/legacyheaders/types/mdatom.h"
76 #include "gromacs/legacyheaders/types/nrnb.h"
77 #include "gromacs/legacyheaders/types/ns.h"
78 #include "gromacs/legacyheaders/types/nsgrid.h"
79 #include "gromacs/legacyheaders/types/state.h"
80 #include "gromacs/listed-forces/manage-threading.h"
81 #include "gromacs/math/vec.h"
82 #include "gromacs/math/vectypes.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/forcerec.h"
85 #include "gromacs/mdlib/mdatoms.h"
86 #include "gromacs/mdlib/mdrun.h"
87 #include "gromacs/mdlib/nb_verlet.h"
88 #include "gromacs/mdlib/nbnxn_grid.h"
89 #include "gromacs/mdlib/shellfc.h"
90 #include "gromacs/pbcutil/ishift.h"
91 #include "gromacs/pbcutil/pbc.h"
92 #include "gromacs/pulling/pull.h"
93 #include "gromacs/pulling/pull_rotation.h"
94 #include "gromacs/swap/swapcoords.h"
95 #include "gromacs/timing/wallcycle.h"
96 #include "gromacs/topology/block.h"
97 #include "gromacs/topology/idef.h"
98 #include "gromacs/topology/mtop_util.h"
99 #include "gromacs/topology/topology.h"
100 #include "gromacs/utility/basedefinitions.h"
101 #include "gromacs/utility/basenetwork.h"
102 #include "gromacs/utility/cstringutil.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/qsort_threadsafe.h"
107 #include "gromacs/utility/real.h"
108 #include "gromacs/utility/smalloc.h"
110 #include "domdec_constraints.h"
111 #include "domdec_internal.h"
112 #include "domdec_vsite.h"
114 #define DDRANK(dd, rank) (rank)
115 #define DDMASTERRANK(dd) (dd->masterrank)
117 typedef struct gmx_domdec_master
119 /* The cell boundaries */
120 real **cell_x;
121 /* The global charge group division */
122 int *ncg; /* Number of home charge groups for each node */
123 int *index; /* Index of nnodes+1 into cg */
124 int *cg; /* Global charge group index */
125 int *nat; /* Number of home atoms for each node. */
126 int *ibuf; /* Buffer for communication */
127 rvec *vbuf; /* Buffer for state scattering and gathering */
128 } gmx_domdec_master_t;
130 #define DD_NLOAD_MAX 9
132 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
134 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
135 #define DD_CGIBS 2
137 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
138 #define DD_FLAG_NRCG 65535
139 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
140 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
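/* Editor's note: illustrative sketch, not part of the original source.
 * A cggl_flag word packs the charge-group atom count into the low
 * 16 bits (masked by DD_FLAG_NRCG) and marks a move along decomposition
 * dimension d with one bit per direction; the receiver recovers the
 * count as flag & DD_FLAG_NRCG.
 */
static int dd_example_pack_cggl_flag(int natoms_in_cg, int d, gmx_bool bForward)
{
    /* The atom count must fit in the low 16 bits */
    assert(natoms_in_cg <= DD_FLAG_NRCG);

    return natoms_in_cg | (bForward ? DD_FLAG_FW(d) : DD_FLAG_BW(d));
}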
142 /* The DD zone order */
143 static const ivec dd_zo[DD_MAXZONE] =
144 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
146 /* The 3D setup */
147 #define dd_z3n 8
148 #define dd_zp3n 4
149 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
151 /* The 2D setup */
152 #define dd_z2n 4
153 #define dd_zp2n 2
154 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
156 /* The 1D setup */
157 #define dd_z1n 2
158 #define dd_zp1n 1
159 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
161 /* The 0D setup */
162 #define dd_z0n 1
163 #define dd_zp0n 1
164 static const ivec dd_zp0[dd_zp0n] = {{0, 0, 1}};
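/* Editor's note (added, best-effort reading of the tables above): each
 * triple {i, j0, j1} in the dd_zp* arrays lists an i-zone and the
 * half-open range [j0, j1) of j-zones it is neighbor-searched against
 * in the dd_zo order, e.g. in the 3D setup i-zone 1 pairs with
 * j-zones 3, 4 and 5.
 */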
166 /* Factors used to avoid problems due to rounding issues */
167 #define DD_CELL_MARGIN 1.0001
168 #define DD_CELL_MARGIN2 1.00005
169 /* Factor to account for pressure scaling during nstlist steps */
170 #define DD_PRES_SCALE_MARGIN 1.02
172 /* Turn on DLB when the load imbalance causes this amount of total loss.
173 * There is a bit of overhead with DLB and it's difficult to achieve
174 * a load imbalance of less than 2% with DLB.
176 #define DD_PERF_LOSS_DLB_ON 0.02
178 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
179 #define DD_PERF_LOSS_WARN 0.05
181 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
183 /* Use separate MPI send and receive commands
184 * when nnodes <= GMX_DD_NNODES_SENDRECV.
185 * This saves memory (and some copying for small nnodes).
186 * For high parallelization scatter and gather calls are used.
188 #define GMX_DD_NNODES_SENDRECV 4
192 /* #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
194    static void index2xyz(ivec nc,int ind,ivec xyz)
196    xyz[XX] = ind % nc[XX];
197    xyz[YY] = (ind / nc[XX]) % nc[YY];
198    xyz[ZZ] = ind / (nc[YY]*nc[XX]); */
202 /* This order is required to minimize the coordinate communication in PME
203 * which uses decomposition in the x direction.
205 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
207 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
209 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
210 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
211 xyz[ZZ] = ind % nc[ZZ];
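/* Editor's note: dd_index and ddindex2xyz are exact inverses. Worked
 * example (illustrative, not in the original source): with nc = {4, 3, 2},
 * the cell i = {2, 1, 1} gets index ((2*3 + 1)*2) + 1 = 15, and 15 maps
 * back to {15/(3*2), (15/2) % 3, 15 % 2} = {2, 1, 1}. A minimal
 * round-trip check:
 */
static void dd_example_check_index_roundtrip(ivec nc)
{
    ivec xyz;
    int  ind;

    for (ind = 0; ind < nc[XX]*nc[YY]*nc[ZZ]; ind++)
    {
        ddindex2xyz(nc, ind, xyz);
        assert(dd_index(nc, xyz) == ind);
    }
}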
214 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
216 int ddindex;
217 int ddnodeid = -1;
219 ddindex = dd_index(dd->nc, c);
220 if (dd->comm->bCartesianPP_PME)
222 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
224 else if (dd->comm->bCartesianPP)
226 #ifdef GMX_MPI
227 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
228 #endif
230 else
232 ddnodeid = ddindex;
235 return ddnodeid;
238 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
240 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
243 int ddglatnr(gmx_domdec_t *dd, int i)
245 int atnr;
247 if (dd == NULL)
249 atnr = i + 1;
251 else
253 if (i >= dd->comm->nat[ddnatNR-1])
255 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
257 atnr = dd->gatindex[i] + 1;
260 return atnr;
263 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
265 return &dd->comm->cgs_gl;
268 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
270 return (comm->dlbState == edlbsOn);
273 static void vec_rvec_init(vec_rvec_t *v)
275 v->nalloc = 0;
276 v->v = NULL;
279 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
281 if (n > v->nalloc)
283 v->nalloc = over_alloc_dd(n);
284 srenew(v->v, v->nalloc);
288 void dd_store_state(gmx_domdec_t *dd, t_state *state)
290 int i;
292 if (state->ddp_count != dd->ddp_count)
294 gmx_incons("The state does not the domain decomposition state");
297 state->ncg_gl = dd->ncg_home;
298 if (state->ncg_gl > state->cg_gl_nalloc)
300 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
301 srenew(state->cg_gl, state->cg_gl_nalloc);
303 for (i = 0; i < state->ncg_gl; i++)
305 state->cg_gl[i] = dd->index_gl[i];
308 state->ddp_count_cg_gl = dd->ddp_count;
311 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
313 return &dd->comm->zones;
316 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
317 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
319 gmx_domdec_zones_t *zones;
320 int izone, d, dim;
322 zones = &dd->comm->zones;
324 izone = 0;
325 while (icg >= zones->izone[izone].cg1)
327 izone++;
330 if (izone == 0)
332 *jcg0 = icg;
334 else if (izone < zones->nizone)
336 *jcg0 = zones->izone[izone].jcg0;
338 else
340 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
341 icg, izone, zones->nizone);
344 *jcg1 = zones->izone[izone].jcg1;
346 for (d = 0; d < dd->ndim; d++)
348 dim = dd->dim[d];
349 shift0[dim] = zones->izone[izone].shift0[dim];
350 shift1[dim] = zones->izone[izone].shift1[dim];
351 if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
353 /* A conservative approach, this can be optimized */
354 shift0[dim] -= 1;
355 shift1[dim] += 1;
360 int dd_natoms_vsite(gmx_domdec_t *dd)
362 return dd->comm->nat[ddnatVSITE];
365 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
367 *at_start = dd->comm->nat[ddnatCON-1];
368 *at_end = dd->comm->nat[ddnatCON];
371 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
373 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
374 int *index, *cgindex;
375 gmx_domdec_comm_t *comm;
376 gmx_domdec_comm_dim_t *cd;
377 gmx_domdec_ind_t *ind;
378 rvec shift = {0, 0, 0}, *buf, *rbuf;
379 gmx_bool bPBC, bScrew;
381 comm = dd->comm;
383 cgindex = dd->cgindex;
385 buf = comm->vbuf.v;
387 nzone = 1;
388 nat_tot = dd->nat_home;
389 for (d = 0; d < dd->ndim; d++)
391 bPBC = (dd->ci[dd->dim[d]] == 0);
392 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
393 if (bPBC)
395 copy_rvec(box[dd->dim[d]], shift);
397 cd = &comm->cd[d];
398 for (p = 0; p < cd->np; p++)
400 ind = &cd->ind[p];
401 index = ind->index;
402 n = 0;
403 if (!bPBC)
405 for (i = 0; i < ind->nsend[nzone]; i++)
407 at0 = cgindex[index[i]];
408 at1 = cgindex[index[i]+1];
409 for (j = at0; j < at1; j++)
411 copy_rvec(x[j], buf[n]);
412 n++;
416 else if (!bScrew)
418 for (i = 0; i < ind->nsend[nzone]; i++)
420 at0 = cgindex[index[i]];
421 at1 = cgindex[index[i]+1];
422 for (j = at0; j < at1; j++)
424 /* We need to shift the coordinates */
425 rvec_add(x[j], shift, buf[n]);
426 n++;
430 else
432 for (i = 0; i < ind->nsend[nzone]; i++)
434 at0 = cgindex[index[i]];
435 at1 = cgindex[index[i]+1];
436 for (j = at0; j < at1; j++)
438 /* Shift x */
439 buf[n][XX] = x[j][XX] + shift[XX];
440 /* Rotate y and z.
441 * This operation requires a special shift force
442 * treatment, which is performed in calc_vir.
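 * (Editor's note, added: a pulse across the x boundary under screw PBC
 * maps (x, y, z) to (x + shift_x, box_yy - y, box_zz - z), i.e. a
 * 180-degree rotation about the x axis, as implemented below.)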
444 buf[n][YY] = box[YY][YY] - x[j][YY];
445 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
446 n++;
451 if (cd->bInPlace)
453 rbuf = x + nat_tot;
455 else
457 rbuf = comm->vbuf2.v;
459 /* Send and receive the coordinates */
460 dd_sendrecv_rvec(dd, d, dddirBackward,
461 buf, ind->nsend[nzone+1],
462 rbuf, ind->nrecv[nzone+1]);
463 if (!cd->bInPlace)
465 j = 0;
466 for (zone = 0; zone < nzone; zone++)
468 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
470 copy_rvec(rbuf[j], x[i]);
471 j++;
475 nat_tot += ind->nrecv[nzone+1];
477 nzone += nzone;
481 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
483 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
484 int *index, *cgindex;
485 gmx_domdec_comm_t *comm;
486 gmx_domdec_comm_dim_t *cd;
487 gmx_domdec_ind_t *ind;
488 rvec *buf, *sbuf;
489 ivec vis;
490 int is;
491 gmx_bool bShiftForcesNeedPbc, bScrew;
493 comm = dd->comm;
495 cgindex = dd->cgindex;
497 buf = comm->vbuf.v;
499 nzone = comm->zones.n/2;
500 nat_tot = dd->nat_tot;
501 for (d = dd->ndim-1; d >= 0; d--)
503 /* Only forces in domains near the PBC boundaries need to
504 consider PBC in the treatment of fshift */
505 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
506 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
507 if (fshift == NULL && !bScrew)
509 bShiftForcesNeedPbc = FALSE;
511 /* Determine which shift vector we need */
512 clear_ivec(vis);
513 vis[dd->dim[d]] = 1;
514 is = IVEC2IS(vis);
516 cd = &comm->cd[d];
517 for (p = cd->np-1; p >= 0; p--)
519 ind = &cd->ind[p];
520 nat_tot -= ind->nrecv[nzone+1];
521 if (cd->bInPlace)
523 sbuf = f + nat_tot;
525 else
527 sbuf = comm->vbuf2.v;
528 j = 0;
529 for (zone = 0; zone < nzone; zone++)
531 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
533 copy_rvec(f[i], sbuf[j]);
534 j++;
538 /* Communicate the forces */
539 dd_sendrecv_rvec(dd, d, dddirForward,
540 sbuf, ind->nrecv[nzone+1],
541 buf, ind->nsend[nzone+1]);
542 index = ind->index;
543 /* Add the received forces */
544 n = 0;
545 if (!bShiftForcesNeedPbc)
547 for (i = 0; i < ind->nsend[nzone]; i++)
549 at0 = cgindex[index[i]];
550 at1 = cgindex[index[i]+1];
551 for (j = at0; j < at1; j++)
553 rvec_inc(f[j], buf[n]);
554 n++;
558 else if (!bScrew)
560 /* fshift should always be defined if this function is
561 * called when bShiftForcesNeedPbc is true */
562 assert(NULL != fshift);
563 for (i = 0; i < ind->nsend[nzone]; i++)
565 at0 = cgindex[index[i]];
566 at1 = cgindex[index[i]+1];
567 for (j = at0; j < at1; j++)
569 rvec_inc(f[j], buf[n]);
570 /* Add this force to the shift force */
571 rvec_inc(fshift[is], buf[n]);
572 n++;
576 else
578 for (i = 0; i < ind->nsend[nzone]; i++)
580 at0 = cgindex[index[i]];
581 at1 = cgindex[index[i]+1];
582 for (j = at0; j < at1; j++)
584 /* Rotate the force */
585 f[j][XX] += buf[n][XX];
586 f[j][YY] -= buf[n][YY];
587 f[j][ZZ] -= buf[n][ZZ];
588 if (fshift)
590 /* Add this force to the shift force */
591 rvec_inc(fshift[is], buf[n]);
593 n++;
598 nzone /= 2;
602 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
604 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
605 int *index, *cgindex;
606 gmx_domdec_comm_t *comm;
607 gmx_domdec_comm_dim_t *cd;
608 gmx_domdec_ind_t *ind;
609 real *buf, *rbuf;
611 comm = dd->comm;
613 cgindex = dd->cgindex;
615 buf = &comm->vbuf.v[0][0];
617 nzone = 1;
618 nat_tot = dd->nat_home;
619 for (d = 0; d < dd->ndim; d++)
621 cd = &comm->cd[d];
622 for (p = 0; p < cd->np; p++)
624 ind = &cd->ind[p];
625 index = ind->index;
626 n = 0;
627 for (i = 0; i < ind->nsend[nzone]; i++)
629 at0 = cgindex[index[i]];
630 at1 = cgindex[index[i]+1];
631 for (j = at0; j < at1; j++)
633 buf[n] = v[j];
634 n++;
638 if (cd->bInPlace)
640 rbuf = v + nat_tot;
642 else
644 rbuf = &comm->vbuf2.v[0][0];
646 /* Send and receive the values */
647 dd_sendrecv_real(dd, d, dddirBackward,
648 buf, ind->nsend[nzone+1],
649 rbuf, ind->nrecv[nzone+1]);
650 if (!cd->bInPlace)
652 j = 0;
653 for (zone = 0; zone < nzone; zone++)
655 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
657 v[i] = rbuf[j];
658 j++;
662 nat_tot += ind->nrecv[nzone+1];
664 nzone += nzone;
668 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
670 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
671 int *index, *cgindex;
672 gmx_domdec_comm_t *comm;
673 gmx_domdec_comm_dim_t *cd;
674 gmx_domdec_ind_t *ind;
675 real *buf, *sbuf;
677 comm = dd->comm;
679 cgindex = dd->cgindex;
681 buf = &comm->vbuf.v[0][0];
683 nzone = comm->zones.n/2;
684 nat_tot = dd->nat_tot;
685 for (d = dd->ndim-1; d >= 0; d--)
687 cd = &comm->cd[d];
688 for (p = cd->np-1; p >= 0; p--)
690 ind = &cd->ind[p];
691 nat_tot -= ind->nrecv[nzone+1];
692 if (cd->bInPlace)
694 sbuf = v + nat_tot;
696 else
698 sbuf = &comm->vbuf2.v[0][0];
699 j = 0;
700 for (zone = 0; zone < nzone; zone++)
702 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
704 sbuf[j] = v[i];
705 j++;
709 /* Communicate the values */
710 dd_sendrecv_real(dd, d, dddirForward,
711 sbuf, ind->nrecv[nzone+1],
712 buf, ind->nsend[nzone+1]);
713 index = ind->index;
714 /* Add the received values */
715 n = 0;
716 for (i = 0; i < ind->nsend[nzone]; i++)
718 at0 = cgindex[index[i]];
719 at1 = cgindex[index[i]+1];
720 for (j = at0; j < at1; j++)
722 v[j] += buf[n];
723 n++;
727 nzone /= 2;
731 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
733 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
734 d, i, j,
735 zone->min0, zone->max1,
736 zone->mch0, zone->mch1,
737 zone->p1_0, zone->p1_1);
741 #define DDZONECOMM_MAXZONE 5
742 #define DDZONECOMM_BUFSIZE 3
744 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
745 int ddimind, int direction,
746 gmx_ddzone_t *buf_s, int n_s,
747 gmx_ddzone_t *buf_r, int n_r)
749 #define ZBS DDZONECOMM_BUFSIZE
750 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
751 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
752 int i;
754 for (i = 0; i < n_s; i++)
756 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
757 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
758 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
759 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
760 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
761 vbuf_s[i*ZBS+1][2] = 0;
762 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
763 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
764 vbuf_s[i*ZBS+2][2] = 0;
767 dd_sendrecv_rvec(dd, ddimind, direction,
768 vbuf_s, n_s*ZBS,
769 vbuf_r, n_r*ZBS);
771 for (i = 0; i < n_r; i++)
773 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
774 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
775 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
776 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
777 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
778 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
779 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
782 #undef ZBS
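/* Editor's note (added summary): dd_sendrecv_ddzone serializes each
 * gmx_ddzone_t into DDZONECOMM_BUFSIZE = 3 rvecs laid out as
 * { min0, max1, min1 }, { mch0, mch1, 0 }, { p1_0, p1_1, 0 },
 * so the existing rvec send/receive routine can be reused for zone data.
 */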
785 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
786 rvec cell_ns_x0, rvec cell_ns_x1)
788 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
789 gmx_ddzone_t *zp;
790 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
791 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
792 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
793 rvec extr_s[2], extr_r[2];
794 rvec dh;
795 real dist_d, c = 0, det;
796 gmx_domdec_comm_t *comm;
797 gmx_bool bPBC, bUse;
799 comm = dd->comm;
801 for (d = 1; d < dd->ndim; d++)
803 dim = dd->dim[d];
804 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
805 zp->min0 = cell_ns_x0[dim];
806 zp->max1 = cell_ns_x1[dim];
807 zp->min1 = cell_ns_x1[dim];
808 zp->mch0 = cell_ns_x0[dim];
809 zp->mch1 = cell_ns_x1[dim];
810 zp->p1_0 = cell_ns_x0[dim];
811 zp->p1_1 = cell_ns_x1[dim];
814 for (d = dd->ndim-2; d >= 0; d--)
816 dim = dd->dim[d];
817 bPBC = (dim < ddbox->npbcdim);
819 /* Use an rvec to store two reals */
820 extr_s[d][0] = comm->cell_f0[d+1];
821 extr_s[d][1] = comm->cell_f1[d+1];
822 extr_s[d][2] = comm->cell_f1[d+1];
824 pos = 0;
825 /* Store the extremes in the backward sending buffer,
826 * so they get updated separately from the forward communication.
828 for (d1 = d; d1 < dd->ndim-1; d1++)
830 /* We invert the order to be able to use the same loop for buf_e */
831 buf_s[pos].min0 = extr_s[d1][1];
832 buf_s[pos].max1 = extr_s[d1][0];
833 buf_s[pos].min1 = extr_s[d1][2];
834 buf_s[pos].mch0 = 0;
835 buf_s[pos].mch1 = 0;
836 /* Store the cell corner of the dimension we communicate along */
837 buf_s[pos].p1_0 = comm->cell_x0[dim];
838 buf_s[pos].p1_1 = 0;
839 pos++;
842 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
843 pos++;
845 if (dd->ndim == 3 && d == 0)
847 buf_s[pos] = comm->zone_d2[0][1];
848 pos++;
849 buf_s[pos] = comm->zone_d1[0];
850 pos++;
853 /* We only need to communicate the extremes
854 * in the forward direction
856 npulse = comm->cd[d].np;
857 if (bPBC)
859 /* Take the minimum to avoid double communication */
860 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
862 else
864 /* Without PBC we should really not communicate over
865 * the boundaries, but implementing that complicates
866 * the communication setup and therefore we simply
867 * do all communication, but ignore some data.
869 npulse_min = npulse;
871 for (p = 0; p < npulse_min; p++)
873 /* Communicate the extremes forward */
874 bUse = (bPBC || dd->ci[dim] > 0);
876 dd_sendrecv_rvec(dd, d, dddirForward,
877 extr_s+d, dd->ndim-d-1,
878 extr_r+d, dd->ndim-d-1);
880 if (bUse)
882 for (d1 = d; d1 < dd->ndim-1; d1++)
884 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
885 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
886 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
891 buf_size = pos;
892 for (p = 0; p < npulse; p++)
894 /* Communicate all the zone information backward */
895 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
897 dd_sendrecv_ddzone(dd, d, dddirBackward,
898 buf_s, buf_size,
899 buf_r, buf_size);
901 clear_rvec(dh);
902 if (p > 0)
904 for (d1 = d+1; d1 < dd->ndim; d1++)
906 /* Determine the decrease of maximum required
907 * communication height along d1 due to the distance along d,
908 * this avoids a lot of useless atom communication.
910 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
912 if (ddbox->tric_dir[dim])
914 /* c is the off-diagonal coupling between the cell planes
915 * along directions d and d1.
917 c = ddbox->v[dim][dd->dim[d1]][dim];
919 else
921 c = 0;
923 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
924 if (det > 0)
926 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
928 else
930 /* A negative value signals out of range */
931 dh[d1] = -1;
936 /* Accumulate the extremes over all pulses */
937 for (i = 0; i < buf_size; i++)
939 if (p == 0)
941 buf_e[i] = buf_r[i];
943 else
945 if (bUse)
947 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
948 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
949 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
952 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
954 d1 = 1;
956 else
958 d1 = d + 1;
960 if (bUse && dh[d1] >= 0)
962 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
963 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
966 /* Copy the received buffer to the send buffer,
967 * to pass the data through with the next pulse.
969 buf_s[i] = buf_r[i];
971 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
972 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
974 /* Store the extremes */
975 pos = 0;
977 for (d1 = d; d1 < dd->ndim-1; d1++)
979 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
980 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
981 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
982 pos++;
985 if (d == 1 || (d == 0 && dd->ndim == 3))
987 for (i = d; i < 2; i++)
989 comm->zone_d2[1-d][i] = buf_e[pos];
990 pos++;
993 if (d == 0)
995 comm->zone_d1[1] = buf_e[pos];
996 pos++;
1002 if (dd->ndim >= 2)
1004 dim = dd->dim[1];
1005 for (i = 0; i < 2; i++)
1007 if (debug)
1009 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1011 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1012 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1015 if (dd->ndim >= 3)
1017 dim = dd->dim[2];
1018 for (i = 0; i < 2; i++)
1020 for (j = 0; j < 2; j++)
1022 if (debug)
1024 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1026 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1027 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1031 for (d = 1; d < dd->ndim; d++)
1033 comm->cell_f_max0[d] = extr_s[d-1][0];
1034 comm->cell_f_min1[d] = extr_s[d-1][1];
1035 if (debug)
1037 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1038 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1043 static void dd_collect_cg(gmx_domdec_t *dd,
1044 t_state *state_local)
1046 gmx_domdec_master_t *ma = NULL;
1047 int buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1049 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1051 /* The master has the correct distribution */
1052 return;
1055 if (state_local->ddp_count == dd->ddp_count)
1057 /* The local state and DD are in sync, use the DD indices */
1058 ncg_home = dd->ncg_home;
1059 cg = dd->index_gl;
1060 nat_home = dd->nat_home;
1062 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1064 /* The DD is out of sync with the local state, but we have stored
1065 * the cg indices with the local state, so we can use those.
1067 t_block *cgs_gl;
1069 cgs_gl = &dd->comm->cgs_gl;
1071 ncg_home = state_local->ncg_gl;
1072 cg = state_local->cg_gl;
1073 nat_home = 0;
1074 for (i = 0; i < ncg_home; i++)
1076 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1079 else
1081 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1084 buf2[0] = ncg_home;
1085 buf2[1] = nat_home;
1086 if (DDMASTER(dd))
1088 ma = dd->ma;
1089 ibuf = ma->ibuf;
1091 else
1093 ibuf = NULL;
1095 /* Collect the charge group and atom counts on the master */
1096 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1098 if (DDMASTER(dd))
1100 ma->index[0] = 0;
1101 for (i = 0; i < dd->nnodes; i++)
1103 ma->ncg[i] = ma->ibuf[2*i];
1104 ma->nat[i] = ma->ibuf[2*i+1];
1105 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1108 /* Make byte counts and indices */
1109 for (i = 0; i < dd->nnodes; i++)
1111 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1112 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1114 if (debug)
1116 fprintf(debug, "Initial charge group distribution: ");
1117 for (i = 0; i < dd->nnodes; i++)
1119 fprintf(debug, " %d", ma->ncg[i]);
1121 fprintf(debug, "\n");
1125 /* Collect the charge group indices on the master */
1126 dd_gatherv(dd,
1127 ncg_home*sizeof(int), cg,
1128 DDMASTER(dd) ? ma->ibuf : NULL,
1129 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1130 DDMASTER(dd) ? ma->cg : NULL);
1132 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1135 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1136 rvec *lv, rvec *v)
1138 gmx_domdec_master_t *ma;
1139 int n, i, c, a, nalloc = 0;
1140 rvec *buf = NULL;
1141 t_block *cgs_gl;
1143 ma = dd->ma;
1145 if (!DDMASTER(dd))
1147 #ifdef GMX_MPI
1148 MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1149 dd->rank, dd->mpi_comm_all);
1150 #endif
1152 else
1154 /* Copy the master coordinates to the global array */
1155 cgs_gl = &dd->comm->cgs_gl;
1157 n = DDMASTERRANK(dd);
1158 a = 0;
1159 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1161 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1163 copy_rvec(lv[a++], v[c]);
1167 for (n = 0; n < dd->nnodes; n++)
1169 if (n != dd->rank)
1171 if (ma->nat[n] > nalloc)
1173 nalloc = over_alloc_dd(ma->nat[n]);
1174 srenew(buf, nalloc);
1176 #ifdef GMX_MPI
1177 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1178 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1179 #endif
1180 a = 0;
1181 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1183 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1185 copy_rvec(buf[a++], v[c]);
1190 sfree(buf);
1194 static void get_commbuffer_counts(gmx_domdec_t *dd,
1195 int **counts, int **disps)
1197 gmx_domdec_master_t *ma;
1198 int n;
1200 ma = dd->ma;
1202 /* Make the rvec count and displacement arrays */
1203 *counts = ma->ibuf;
1204 *disps = ma->ibuf + dd->nnodes;
1205 for (n = 0; n < dd->nnodes; n++)
1207 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1208 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
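/* Editor's note (illustrative, not in the original source): disps is the
 * exclusive prefix sum of counts, as MPI_Gatherv/MPI_Scatterv expect;
 * e.g. nat = {1, 2, 1} gives counts = {1, 2, 1}*sizeof(rvec) and
 * disps = {0, 1, 3}*sizeof(rvec).
 */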
1212 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1213 rvec *lv, rvec *v)
1215 gmx_domdec_master_t *ma;
1216 int *rcounts = NULL, *disps = NULL;
1217 int n, i, c, a;
1218 rvec *buf = NULL;
1219 t_block *cgs_gl;
1221 ma = dd->ma;
1223 if (DDMASTER(dd))
1225 get_commbuffer_counts(dd, &rcounts, &disps);
1227 buf = ma->vbuf;
1230 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1232 if (DDMASTER(dd))
1234 cgs_gl = &dd->comm->cgs_gl;
1236 a = 0;
1237 for (n = 0; n < dd->nnodes; n++)
1239 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1241 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1243 copy_rvec(buf[a++], v[c]);
1250 void dd_collect_vec(gmx_domdec_t *dd,
1251 t_state *state_local, rvec *lv, rvec *v)
1253 dd_collect_cg(dd, state_local);
1255 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1257 dd_collect_vec_sendrecv(dd, lv, v);
1259 else
1261 dd_collect_vec_gatherv(dd, lv, v);
1266 void dd_collect_state(gmx_domdec_t *dd,
1267 t_state *state_local, t_state *state)
1269 int est, i, j, nh;
1271 nh = state->nhchainlength;
1273 if (DDMASTER(dd))
1275 for (i = 0; i < efptNR; i++)
1277 state->lambda[i] = state_local->lambda[i];
1279 state->fep_state = state_local->fep_state;
1280 state->veta = state_local->veta;
1281 state->vol0 = state_local->vol0;
1282 copy_mat(state_local->box, state->box);
1283 copy_mat(state_local->boxv, state->boxv);
1284 copy_mat(state_local->svir_prev, state->svir_prev);
1285 copy_mat(state_local->fvir_prev, state->fvir_prev);
1286 copy_mat(state_local->pres_prev, state->pres_prev);
1288 for (i = 0; i < state_local->ngtc; i++)
1290 for (j = 0; j < nh; j++)
1292 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1293 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1295 state->therm_integral[i] = state_local->therm_integral[i];
1297 for (i = 0; i < state_local->nnhpres; i++)
1299 for (j = 0; j < nh; j++)
1301 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1302 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1306 for (est = 0; est < estNR; est++)
1308 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1310 switch (est)
1312 case estX:
1313 dd_collect_vec(dd, state_local, state_local->x, state->x);
1314 break;
1315 case estV:
1316 dd_collect_vec(dd, state_local, state_local->v, state->v);
1317 break;
1318 case estSDX:
1319 dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1320 break;
1321 case estCGP:
1322 dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1323 break;
1324 case estDISRE_INITF:
1325 case estDISRE_RM3TAV:
1326 case estORIRE_INITF:
1327 case estORIRE_DTAV:
1328 break;
1329 default:
1330 gmx_incons("Unknown state entry encountered in dd_collect_state");
1336 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1338 int est;
1340 if (debug)
1342 fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1345 state->nalloc = over_alloc_dd(nalloc);
1347 for (est = 0; est < estNR; est++)
1349 if (EST_DISTR(est) && (state->flags & (1<<est)))
1351 switch (est)
1353 case estX:
1354 srenew(state->x, state->nalloc);
1355 break;
1356 case estV:
1357 srenew(state->v, state->nalloc);
1358 break;
1359 case estSDX:
1360 srenew(state->sd_X, state->nalloc);
1361 break;
1362 case estCGP:
1363 srenew(state->cg_p, state->nalloc);
1364 break;
1365 case estDISRE_INITF:
1366 case estDISRE_RM3TAV:
1367 case estORIRE_INITF:
1368 case estORIRE_DTAV:
1369 /* No reallocation required */
1370 break;
1371 default:
1372 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1377 if (f != NULL)
1379 srenew(*f, state->nalloc);
1383 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1384 int nalloc)
1386 if (nalloc > fr->cg_nalloc)
1388 if (debug)
1390 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1392 fr->cg_nalloc = over_alloc_dd(nalloc);
1393 srenew(fr->cginfo, fr->cg_nalloc);
1394 if (fr->cutoff_scheme == ecutsGROUP)
1396 srenew(fr->cg_cm, fr->cg_nalloc);
1399 if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1401 /* We don't use charge groups, we use x in state to set up
1402 * the atom communication.
1404 dd_realloc_state(state, f, nalloc);
1408 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1409 rvec *v, rvec *lv)
1411 gmx_domdec_master_t *ma;
1412 int n, i, c, a, nalloc = 0;
1413 rvec *buf = NULL;
1415 if (DDMASTER(dd))
1417 ma = dd->ma;
1419 for (n = 0; n < dd->nnodes; n++)
1421 if (n != dd->rank)
1423 if (ma->nat[n] > nalloc)
1425 nalloc = over_alloc_dd(ma->nat[n]);
1426 srenew(buf, nalloc);
1428 /* Use lv as a temporary buffer */
1429 a = 0;
1430 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1432 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1434 copy_rvec(v[c], buf[a++]);
1437 if (a != ma->nat[n])
1439 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1440 a, ma->nat[n]);
1443 #ifdef GMX_MPI
1444 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1445 DDRANK(dd, n), n, dd->mpi_comm_all);
1446 #endif
1449 sfree(buf);
1450 n = DDMASTERRANK(dd);
1451 a = 0;
1452 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1454 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1456 copy_rvec(v[c], lv[a++]);
1460 else
1462 #ifdef GMX_MPI
1463 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1464 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1465 #endif
1469 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1470 rvec *v, rvec *lv)
1472 gmx_domdec_master_t *ma;
1473 int *scounts = NULL, *disps = NULL;
1474 int n, i, c, a;
1475 rvec *buf = NULL;
1477 if (DDMASTER(dd))
1479 ma = dd->ma;
1481 get_commbuffer_counts(dd, &scounts, &disps);
1483 buf = ma->vbuf;
1484 a = 0;
1485 for (n = 0; n < dd->nnodes; n++)
1487 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1489 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1491 copy_rvec(v[c], buf[a++]);
1497 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1500 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1502 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1504 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1506 else
1508 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1512 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1514 int i;
1515 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1516 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1517 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1519 if (dfhist->nlambda > 0)
1521 int nlam = dfhist->nlambda;
1522 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1523 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1524 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1525 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1526 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1527 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1529 for (i = 0; i < nlam; i++)
1531 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1532 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1533 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1534 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1535 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1536 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
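/* Editor's note (added): accum_p, accum_m, accum_p2, accum_m2, Tij and
 * Tij_empirical are nlambda x nlambda arrays stored as one row pointer
 * per lambda state, hence the row-by-row broadcast above.
 */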
1541 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1542 t_state *state, t_state *state_local,
1543 rvec **f)
1545 int i, j, nh;
1547 nh = state->nhchainlength;
1549 if (DDMASTER(dd))
1551 for (i = 0; i < efptNR; i++)
1553 state_local->lambda[i] = state->lambda[i];
1555 state_local->fep_state = state->fep_state;
1556 state_local->veta = state->veta;
1557 state_local->vol0 = state->vol0;
1558 copy_mat(state->box, state_local->box);
1559 copy_mat(state->box_rel, state_local->box_rel);
1560 copy_mat(state->boxv, state_local->boxv);
1561 copy_mat(state->svir_prev, state_local->svir_prev);
1562 copy_mat(state->fvir_prev, state_local->fvir_prev);
1563 copy_df_history(&state_local->dfhist, &state->dfhist);
1564 for (i = 0; i < state_local->ngtc; i++)
1566 for (j = 0; j < nh; j++)
1568 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1569 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1571 state_local->therm_integral[i] = state->therm_integral[i];
1573 for (i = 0; i < state_local->nnhpres; i++)
1575 for (j = 0; j < nh; j++)
1577 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1578 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1582 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1583 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1584 dd_bcast(dd, sizeof(real), &state_local->veta);
1585 dd_bcast(dd, sizeof(real), &state_local->vol0);
1586 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1587 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1588 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1589 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1590 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1591 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1592 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1593 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1594 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1595 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1597 /* communicate df_history -- required for restarting from checkpoint */
1598 dd_distribute_dfhist(dd, &state_local->dfhist);
1600 if (dd->nat_home > state_local->nalloc)
1602 dd_realloc_state(state_local, f, dd->nat_home);
1604 for (i = 0; i < estNR; i++)
1606 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1608 switch (i)
1610 case estX:
1611 dd_distribute_vec(dd, cgs, state->x, state_local->x);
1612 break;
1613 case estV:
1614 dd_distribute_vec(dd, cgs, state->v, state_local->v);
1615 break;
1616 case estSDX:
1617 dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1618 break;
1619 case estCGP:
1620 dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1621 break;
1622 case estDISRE_INITF:
1623 case estDISRE_RM3TAV:
1624 case estORIRE_INITF:
1625 case estORIRE_DTAV:
1626 /* Not implemented yet */
1627 break;
1628 default:
1629 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1635 static char dim2char(int dim)
1637 char c = '?';
1639 switch (dim)
1641 case XX: c = 'X'; break;
1642 case YY: c = 'Y'; break;
1643 case ZZ: c = 'Z'; break;
1644 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1647 return c;
1650 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1651 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1653 rvec grid_s[2], *grid_r = NULL, cx, r;
1654 char fname[STRLEN], buf[22];
1655 FILE *out;
1656 int a, i, d, z, y, x;
1657 matrix tric;
1658 real vol;
1660 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1661 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1663 if (DDMASTER(dd))
1665 snew(grid_r, 2*dd->nnodes);
1668 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1670 if (DDMASTER(dd))
1672 for (d = 0; d < DIM; d++)
1674 for (i = 0; i < DIM; i++)
1676 if (d == i)
1678 tric[d][i] = 1;
1680 else
1682 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1684 tric[d][i] = box[i][d]/box[i][i];
1686 else
1688 tric[d][i] = 0;
1693 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1694 out = gmx_fio_fopen(fname, "w");
1695 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1696 a = 1;
1697 for (i = 0; i < dd->nnodes; i++)
1699 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1700 for (d = 0; d < DIM; d++)
1702 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1704 for (z = 0; z < 2; z++)
1706 for (y = 0; y < 2; y++)
1708 for (x = 0; x < 2; x++)
1710 cx[XX] = grid_r[i*2+x][XX];
1711 cx[YY] = grid_r[i*2+y][YY];
1712 cx[ZZ] = grid_r[i*2+z][ZZ];
1713 mvmul(tric, cx, r);
1714 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1715 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1719 for (d = 0; d < DIM; d++)
1721 for (x = 0; x < 4; x++)
1723 switch (d)
1725 case 0: y = 1 + i*8 + 2*x; break;
1726 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1727 case 2: y = 1 + i*8 + x; break;
1729 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1733 gmx_fio_fclose(out);
1734 sfree(grid_r);
1738 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1739 gmx_mtop_t *mtop, t_commrec *cr,
1740 int natoms, rvec x[], matrix box)
1742 char fname[STRLEN], buf[22];
1743 FILE *out;
1744 int i, ii, resnr, c;
1745 char *atomname, *resname;
1746 real b;
1747 gmx_domdec_t *dd;
1749 dd = cr->dd;
1750 if (natoms == -1)
1752 natoms = dd->comm->nat[ddnatVSITE];
1755 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1757 out = gmx_fio_fopen(fname, "w");
1759 fprintf(out, "TITLE %s\n", title);
1760 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1761 for (i = 0; i < natoms; i++)
1763 ii = dd->gatindex[i];
1764 gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
1765 if (i < dd->comm->nat[ddnatZONE])
1767 c = 0;
1768 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1770 c++;
1772 b = c;
1774 else if (i < dd->comm->nat[ddnatVSITE])
1776 b = dd->comm->zones.n;
1778 else
1780 b = dd->comm->zones.n + 1;
1782 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1783 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1785 fprintf(out, "TER\n");
1787 gmx_fio_fclose(out);
1790 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1792 gmx_domdec_comm_t *comm;
1793 int di;
1794 real r;
1796 comm = dd->comm;
1798 r = -1;
1799 if (comm->bInterCGBondeds)
1801 if (comm->cutoff_mbody > 0)
1803 r = comm->cutoff_mbody;
1805 else
1807 /* cutoff_mbody=0 means we do not have DLB */
1808 r = comm->cellsize_min[dd->dim[0]];
1809 for (di = 1; di < dd->ndim; di++)
1811 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1813 if (comm->bBondComm)
1815 r = std::max(r, comm->cutoff_mbody);
1817 else
1819 r = std::min(r, comm->cutoff);
1824 return r;
1827 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1829 real r_mb;
1831 r_mb = dd_cutoff_multibody(dd);
1833 return std::max(dd->comm->cutoff, r_mb);
1837 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
1839 int nc, ntot;
1841 nc = dd->nc[dd->comm->cartpmedim];
1842 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1843 copy_ivec(coord, coord_pme);
1844 coord_pme[dd->comm->cartpmedim] =
1845 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
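/* Editor's note (worked example, not in the original source): with
 * nc = 4 PP cells and ntot = 6 ranks along cartpmedim (so 2 PME ranks),
 * PP coordinates 0, 1, 2, 3 map to 4 + (coord*2 + 1)/4 = 4, 4, 5, 5:
 * each PME rank serves a contiguous half of the PP cells.
 */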
1848 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
1850 /* Here we assign a PME node to communicate with this DD node
1851 * by assuming that the major index of both is x.
1852 * We add cr->npmenodes/2 to obtain an even distribution.
1854 return (ddindex*npme + npme/2)/ndd;
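/* Editor's note (worked example, not in the original source): with
 * ndd = 8 PP ranks and npme = 4 PME ranks this is (ddindex*4 + 2)/8,
 * mapping PP ranks {0,1}, {2,3}, {4,5}, {6,7} to PME ranks 0, 1, 2, 3:
 * an even distribution, as the comment above intends.
 */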
1857 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1859 return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
1862 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
1864 return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
1867 static int *dd_pmenodes(t_commrec *cr)
1869 int *pmenodes;
1870 int n, i, p0, p1;
1872 snew(pmenodes, cr->npmenodes);
1873 n = 0;
1874 for (i = 0; i < cr->dd->nnodes; i++)
1876 p0 = cr_ddindex2pmeindex(cr, i);
1877 p1 = cr_ddindex2pmeindex(cr, i+1);
1878 if (i+1 == cr->dd->nnodes || p1 > p0)
1880 if (debug)
1882 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
1884 pmenodes[n] = i + 1 + n;
1885 n++;
1889 return pmenodes;
1892 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1894 gmx_domdec_t *dd;
1895 ivec coords;
1896 int slab;
1898 dd = cr->dd;
1900 /* if (dd->comm->bCartesian) {
1901    gmx_ddindex2xyz(dd->nc,ddindex,coords);
1902    dd_coords2pmecoords(dd,coords,coords_pme);
1903    copy_ivec(dd->ntot,nc);
1904    nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1905    coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1907    slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1908    } else {
1909    slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes; } */
1912 coords[XX] = x;
1913 coords[YY] = y;
1914 coords[ZZ] = z;
1915 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1917 return slab;
1920 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1922 gmx_domdec_comm_t *comm;
1923 ivec coords;
1924 int ddindex, nodeid = -1;
1926 comm = cr->dd->comm;
1928 coords[XX] = x;
1929 coords[YY] = y;
1930 coords[ZZ] = z;
1931 if (comm->bCartesianPP_PME)
1933 #ifdef GMX_MPI
1934 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1935 #endif
1937 else
1939 ddindex = dd_index(cr->dd->nc, coords);
1940 if (comm->bCartesianPP)
1942 nodeid = comm->ddindex2simnodeid[ddindex];
1944 else
1946 if (comm->pmenodes)
1948 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1950 else
1952 nodeid = ddindex;
1957 return nodeid;
1960 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
1962 gmx_domdec_t *dd;
1963 gmx_domdec_comm_t *comm;
1964 int i;
1965 int pmenode = -1;
1967 dd = cr->dd;
1968 comm = dd->comm;
1970 /* This assumes a uniform x domain decomposition grid cell size */
1971 if (comm->bCartesianPP_PME)
1973 #ifdef GMX_MPI
1974 ivec coord, coord_pme;
1975 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1976 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1978 /* This is a PP node */
1979 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1980 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1982 #endif
1984 else if (comm->bCartesianPP)
1986 if (sim_nodeid < dd->nnodes)
1988 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1991 else
1993 /* This assumes DD cells with identical x coordinates
1994 * are numbered sequentially.
1996 if (dd->comm->pmenodes == NULL)
1998 if (sim_nodeid < dd->nnodes)
2000 /* The DD index equals the nodeid */
2001 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2004 else
2006 i = 0;
2007 while (sim_nodeid > dd->comm->pmenodes[i])
2009 i++;
2011 if (sim_nodeid < dd->comm->pmenodes[i])
2013 pmenode = dd->comm->pmenodes[i];
2018 return pmenode;
2021 void get_pme_nnodes(const gmx_domdec_t *dd,
2022 int *npmenodes_x, int *npmenodes_y)
2024 if (dd != NULL)
2026 *npmenodes_x = dd->comm->npmenodes_x;
2027 *npmenodes_y = dd->comm->npmenodes_y;
2029 else
2031 *npmenodes_x = 1;
2032 *npmenodes_y = 1;
2036 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2037 int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2039 gmx_domdec_t *dd;
2040 int x, y, z;
2041 ivec coord, coord_pme;
2043 dd = cr->dd;
2045 snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2047 *nmy_ddnodes = 0;
2048 for (x = 0; x < dd->nc[XX]; x++)
2050 for (y = 0; y < dd->nc[YY]; y++)
2052 for (z = 0; z < dd->nc[ZZ]; z++)
2054 if (dd->comm->bCartesianPP_PME)
2056 coord[XX] = x;
2057 coord[YY] = y;
2058 coord[ZZ] = z;
2059 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2060 if (dd->ci[XX] == coord_pme[XX] &&
2061 dd->ci[YY] == coord_pme[YY] &&
2062 dd->ci[ZZ] == coord_pme[ZZ])
2064 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2067 else
2069 /* The slab corresponds to the nodeid in the PME group */
2070 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2072 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2079 /* The last PP-only node is the peer node */
2080 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2082 if (debug)
2084 fprintf(debug, "Receive coordinates from PP ranks:");
2085 for (x = 0; x < *nmy_ddnodes; x++)
2087 fprintf(debug, " %d", (*my_ddnodes)[x]);
2089 fprintf(debug, "\n");
2093 static gmx_bool receive_vir_ener(t_commrec *cr)
2095 gmx_domdec_comm_t *comm;
2096 int pmenode;
2097 gmx_bool bReceive;
2099 bReceive = TRUE;
2100 if (cr->npmenodes < cr->dd->nnodes)
2102 comm = cr->dd->comm;
2103 if (comm->bCartesianPP_PME)
2105 pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2106 #ifdef GMX_MPI
2107 ivec coords;
2108 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2109 coords[comm->cartpmedim]++;
2110 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2112 int rank;
2113 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2114 if (dd_simnode2pmenode(cr, rank) == pmenode)
2116 /* This is not the last PP node for pmenode */
2117 bReceive = FALSE;
2120 #endif
2122 else
2124 pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2125 if (cr->sim_nodeid+1 < cr->nnodes &&
2126 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2128 /* This is not the last PP node for pmenode */
2129 bReceive = FALSE;
2134 return bReceive;
2137 static void set_zones_ncg_home(gmx_domdec_t *dd)
2139 gmx_domdec_zones_t *zones;
2140 int i;
2142 zones = &dd->comm->zones;
2144 zones->cg_range[0] = 0;
2145 for (i = 1; i < zones->n+1; i++)
2147 zones->cg_range[i] = dd->ncg_home;
2149 /* zone_ncg1[0] should always be equal to ncg_home */
2150 dd->comm->zone_ncg1[0] = dd->ncg_home;
2153 static void rebuild_cgindex(gmx_domdec_t *dd,
2154 const int *gcgs_index, t_state *state)
2156 int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2158 ind = state->cg_gl;
2159 dd_cg_gl = dd->index_gl;
2160 cgindex = dd->cgindex;
2161 nat = 0;
2162 cgindex[0] = nat;
2163 for (i = 0; i < state->ncg_gl; i++)
2165 cgindex[i] = nat;
2166 cg_gl = ind[i];
2167 dd_cg_gl[i] = cg_gl;
2168 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2170 cgindex[i] = nat;
2172 dd->ncg_home = state->ncg_gl;
2173 dd->nat_home = nat;
2175 set_zones_ncg_home(dd);
2178 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2180 while (cg >= cginfo_mb->cg_end)
2182 cginfo_mb++;
2185 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
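/* Editor's note (added, hedged): cginfo is stored once per block of
 * identical molecules; cg_mod is the number of charge groups of one
 * repeating unit, so (cg - cg_start) % cg_mod selects the corresponding
 * entry of the first copy.
 */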
2188 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2189 t_forcerec *fr, char *bLocalCG)
2191 cginfo_mb_t *cginfo_mb;
2192 int *cginfo;
2193 int cg;
2195 if (fr != NULL)
2197 cginfo_mb = fr->cginfo_mb;
2198 cginfo = fr->cginfo;
2200 for (cg = cg0; cg < cg1; cg++)
2202 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2206 if (bLocalCG != NULL)
2208 for (cg = cg0; cg < cg1; cg++)
2210 bLocalCG[index_gl[cg]] = TRUE;
2215 static void make_dd_indices(gmx_domdec_t *dd,
2216 const int *gcgs_index, int cg_start)
2218 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2219 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2220 gmx_bool bCGs;
2222 if (dd->nat_tot > dd->gatindex_nalloc)
2224 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2225 srenew(dd->gatindex, dd->gatindex_nalloc);
2228 nzone = dd->comm->zones.n;
2229 zone2cg = dd->comm->zones.cg_range;
2230 zone_ncg1 = dd->comm->zone_ncg1;
2231 index_gl = dd->index_gl;
2232 gatindex = dd->gatindex;
2233 bCGs = dd->comm->bCGs;
2235 if (zone2cg[1] != dd->ncg_home)
2237 gmx_incons("dd->ncg_zone is not up to date");
2240 /* Make the local to global and global to local atom index */
2241 a = dd->cgindex[cg_start];
2242 for (zone = 0; zone < nzone; zone++)
2244 if (zone == 0)
2246 cg0 = cg_start;
2248 else
2250 cg0 = zone2cg[zone];
2252 cg1 = zone2cg[zone+1];
2253 cg1_p1 = cg0 + zone_ncg1[zone];
2255 for (cg = cg0; cg < cg1; cg++)
2257 zone1 = zone;
2258 if (cg >= cg1_p1)
2260 /* Signal that this cg is from more than one pulse away */
2261 zone1 += nzone;
2263 cg_gl = index_gl[cg];
2264 if (bCGs)
2266 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2268 gatindex[a] = a_gl;
2269 ga2la_set(dd->ga2la, a_gl, a, zone1);
2270 a++;
2273 else
2275 gatindex[a] = cg_gl;
2276 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2277 a++;
2283 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2284 const char *where)
2286 int i, ngl, nerr;
2288 nerr = 0;
2289 if (bLocalCG == NULL)
2291 return nerr;
2293 for (i = 0; i < dd->ncg_tot; i++)
2295 if (!bLocalCG[dd->index_gl[i]])
2297 fprintf(stderr,
2298 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2299 nerr++;
2302 ngl = 0;
2303 for (i = 0; i < ncg_sys; i++)
2305 if (bLocalCG[i])
2307 ngl++;
2310 if (ngl != dd->ncg_tot)
2312 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2313 nerr++;
2316 return nerr;
2319 static void check_index_consistency(gmx_domdec_t *dd,
2320 int natoms_sys, int ncg_sys,
2321 const char *where)
2323 int nerr, ngl, i, a, cell;
2324 int *have;
2326 nerr = 0;
2328 if (dd->comm->DD_debug > 1)
2330 snew(have, natoms_sys);
2331 for (a = 0; a < dd->nat_tot; a++)
2333 if (have[dd->gatindex[a]] > 0)
2335 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2337 else
2339 have[dd->gatindex[a]] = a + 1;
2342 sfree(have);
2345 snew(have, dd->nat_tot);
2347 ngl = 0;
2348 for (i = 0; i < natoms_sys; i++)
2350 if (ga2la_get(dd->ga2la, i, &a, &cell))
2352 if (a >= dd->nat_tot)
2354 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2355 nerr++;
2357 else
2359 have[a] = 1;
2360 if (dd->gatindex[a] != i)
2362 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2363 nerr++;
2366 ngl++;
2369 if (ngl != dd->nat_tot)
2371 fprintf(stderr,
2372 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2373 dd->rank, where, ngl, dd->nat_tot);
2375 for (a = 0; a < dd->nat_tot; a++)
2377 if (have[a] == 0)
2379 fprintf(stderr,
2380 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2381 dd->rank, where, a+1, dd->gatindex[a]+1);
2384 sfree(have);
2386 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2388 if (nerr > 0)
2390 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2391 dd->rank, where, nerr);
2395 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2397 int i;
2398 char *bLocalCG;
2400 if (a_start == 0)
2402 /* Clear the whole list without searching */
2403 ga2la_clear(dd->ga2la);
2405 else
2407 for (i = a_start; i < dd->nat_tot; i++)
2409 ga2la_del(dd->ga2la, dd->gatindex[i]);
2413 bLocalCG = dd->comm->bLocalCG;
2414 if (bLocalCG)
2416 for (i = cg_start; i < dd->ncg_tot; i++)
2418 bLocalCG[dd->index_gl[i]] = FALSE;
2422 dd_clear_local_vsite_indices(dd);
2424 if (dd->constraints)
2426 dd_clear_local_constraint_indices(dd);
2430 /* This function should be used for moving the domain boundaries during DLB,
2431 * for obtaining the minimum cell size. It checks the initially set limit
2432 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2433 * and, possibly, a longer cut-off limit set for PME load balancing.
2435 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2437 real cellsize_min;
2439 cellsize_min = comm->cellsize_min[dim];
2441 if (!comm->bVacDLBNoLimit)
2443 /* The cut-off might have changed, e.g. by PME load balancing,
2444 * from the value used to set comm->cellsize_min, so check it.
2446 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2448 if (comm->bPMELoadBalDLBLimits)
2450 /* Check for the cut-off limit set by the PME load balancing */
2451 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2455 return cellsize_min;
2458 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2459 int dim_ind)
2461 real grid_jump_limit;
2463 /* The distance between the boundaries of cells at distance
2464 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2465 * and by the fact that cells should not be shifted by more than
2466 * half their size, such that cg's only shift by one cell
2467 * at redecomposition.
2469 grid_jump_limit = comm->cellsize_limit;
2470 if (!comm->bVacDLBNoLimit)
2472 if (comm->bPMELoadBalDLBLimits)
2474 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2476 grid_jump_limit = std::max(grid_jump_limit,
2477 cutoff/comm->cd[dim_ind].np);
2480 return grid_jump_limit;
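/* Illustrative example (hypothetical values, added as an annotation): with
 * cellsize_limit = 0.4 nm, cutoff = 1.0 nm and cd[dim_ind].np = 2 pulses,
 * the jump limit is max(0.4, 1.0/2) = 0.5 nm: the staggered boundaries of
 * diagonally neighboring cells may not approach each other closer than this
 * without violating the cut-off restrictions.
 */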
2483 static gmx_bool check_grid_jump(gmx_int64_t step,
2484 gmx_domdec_t *dd,
2485 real cutoff,
2486 gmx_ddbox_t *ddbox,
2487 gmx_bool bFatal)
2489 gmx_domdec_comm_t *comm;
2490 int d, dim;
2491 real limit, bfac;
2492 gmx_bool bInvalid;
2494 bInvalid = FALSE;
2496 comm = dd->comm;
2498 for (d = 1; d < dd->ndim; d++)
2500 dim = dd->dim[d];
2501 limit = grid_jump_limit(comm, cutoff, d);
2502 bfac = ddbox->box_size[dim];
2503 if (ddbox->tric_dir[dim])
2505 bfac *= ddbox->skew_fac[dim];
2507 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2508 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2510 bInvalid = TRUE;
2512 if (bFatal)
2514 char buf[22];
2516 /* This error should never be triggered under normal
2517 * circumstances, but you never know ...
2519 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2520 gmx_step_str(step, buf),
2521 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2526 return bInvalid;
2529 static int dd_load_count(gmx_domdec_comm_t *comm)
2531 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2534 static float dd_force_load(gmx_domdec_comm_t *comm)
2536 float load;
2538 if (comm->eFlop)
2540 load = comm->flop;
2541 if (comm->eFlop > 1)
2543 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2546 else
2548 load = comm->cycl[ddCyclF];
2549 if (comm->cycl_n[ddCyclF] > 1)
2551 /* Subtract the maximum of the last n cycle counts
2552 * to get rid of possible high counts due to other sources,
2553 * for instance system activity, that would otherwise
2554 * affect the dynamic load balancing.
2556 load -= comm->cycl_max[ddCyclF];
2559 #ifdef GMX_MPI
2560 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2562 float gpu_wait, gpu_wait_sum;
2564 gpu_wait = comm->cycl[ddCyclWaitGPU];
2565 if (comm->cycl_n[ddCyclF] > 1)
2567 /* We should remove the WaitGPU time of the same MD step
2568 * as the one with the maximum F time, since the F time
2569 * and the wait time are not independent.
2570 * Furthermore, the step for the max F time should be chosen
2571 * the same on all ranks that share the same GPU.
2572 * But to keep the code simple, we remove the average instead.
2573 * The main reason for artificially long times at some steps
2574 * is spurious CPU activity or MPI time, so we don't expect
2575 * that changes in the GPU wait time matter a lot here.
2577 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2579 /* Sum the wait times over the ranks that share the same GPU */
2580 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2581 comm->mpi_comm_gpu_shared);
2582 /* Replace the wait time by the average over the ranks */
2583 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2585 #endif
2588 return load;
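/* Illustrative example (hypothetical values) for the GPU-wait averaging
 * above: if two ranks sharing one GPU measured wait times of 10 and 30
 * cycle units, gpu_wait_sum = 40 and each rank adds -own_wait + 40/2,
 * i.e. +10 and -10 respectively. The summed load of the pair is unchanged,
 * but the artificial imbalance between the two ranks is removed.
 */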
2591 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2593 gmx_domdec_comm_t *comm;
2594 int i;
2596 comm = dd->comm;
2598 snew(*dim_f, dd->nc[dim]+1);
2599 (*dim_f)[0] = 0;
2600 for (i = 1; i < dd->nc[dim]; i++)
2602 if (comm->slb_frac[dim])
2604 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2606 else
2608 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2611 (*dim_f)[dd->nc[dim]] = 1;
2614 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2616 int pmeindex, slab, nso, i;
2617 ivec xyz;
2619 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2621 ddpme->dim = YY;
2623 else
2625 ddpme->dim = dimind;
2627 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2629 ddpme->nslab = (ddpme->dim == 0 ?
2630 dd->comm->npmenodes_x :
2631 dd->comm->npmenodes_y);
2633 if (ddpme->nslab <= 1)
2635 return;
2638 nso = dd->comm->npmenodes/ddpme->nslab;
2639 /* Determine for each PME slab the PP location range for dimension dim */
2640 snew(ddpme->pp_min, ddpme->nslab);
2641 snew(ddpme->pp_max, ddpme->nslab);
2642 for (slab = 0; slab < ddpme->nslab; slab++)
2644 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2645 ddpme->pp_max[slab] = 0;
2647 for (i = 0; i < dd->nnodes; i++)
2649 ddindex2xyz(dd->nc, i, xyz);
2650 /* For y only use our y/z slab.
2651 * This assumes that the PME x grid size matches the DD grid size.
2653 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2655 pmeindex = ddindex2pmeindex(dd, i);
2656 if (dimind == 0)
2658 slab = pmeindex/nso;
2660 else
2662 slab = pmeindex % ddpme->nslab;
2664 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2665 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2669 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2672 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2674 if (dd->comm->ddpme[0].dim == XX)
2676 return dd->comm->ddpme[0].maxshift;
2678 else
2680 return 0;
2684 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2686 if (dd->comm->ddpme[0].dim == YY)
2688 return dd->comm->ddpme[0].maxshift;
2690 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2692 return dd->comm->ddpme[1].maxshift;
2694 else
2696 return 0;
2700 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2701 gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2703 gmx_domdec_comm_t *comm;
2704 int nc, ns, s;
2705 int *xmin, *xmax;
2706 real range, pme_boundary;
2707 int sh;
2709 comm = dd->comm;
2710 nc = dd->nc[ddpme->dim];
2711 ns = ddpme->nslab;
2713 if (!ddpme->dim_match)
2715 /* PP decomposition is not along dim: the worst situation */
2716 sh = ns/2;
2718 else if (ns <= 3 || (bUniform && ns == nc))
2720 /* The optimal situation */
2721 sh = 1;
2723 else
2725 /* We need to check, for all PME nodes, which nodes they
2726 * could possibly need to communicate with.
2728 xmin = ddpme->pp_min;
2729 xmax = ddpme->pp_max;
2730 /* Allow for atoms to be maximally 2/3 times the cut-off
2731 * out of their DD cell. This is a reasonable balance
2732 * between performance and support for most charge-group/cut-off
2733 * combinations.
2735 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2736 /* Avoid extra communication when we are exactly at a boundary */
2737 range *= 0.999;
2739 sh = 1;
2740 for (s = 0; s < ns; s++)
2742 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2743 pme_boundary = (real)s/ns;
2744 while (sh+1 < ns &&
2745 ((s-(sh+1) >= 0 &&
2746 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2747 (s-(sh+1) < 0 &&
2748 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2750 sh++;
2752 pme_boundary = (real)(s+1)/ns;
2753 while (sh+1 < ns &&
2754 ((s+(sh+1) < ns &&
2755 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2756 (s+(sh+1) >= ns &&
2757 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2759 sh++;
2764 ddpme->maxshift = sh;
2766 if (debug)
2768 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2769 ddpme->dim, ddpme->maxshift);
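/* Annotation (a reading of the loops above, not authoritative): maxshift = sh
 * means that PME slab s may need coordinates from PP cells up to sh slabs
 * away on either side (s-sh ... s+sh, with periodic wrapping). sh = 1 is the
 * optimal case where each PME rank only communicates with the directly
 * adjacent PP slabs; larger sh increases the communication volume.
 */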
2773 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2775 int d, dim;
2777 for (d = 0; d < dd->ndim; d++)
2779 dim = dd->dim[d];
2780 if (dim < ddbox->nboundeddim &&
2781 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2782 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2784 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2785 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2786 dd->nc[dim], dd->comm->cellsize_limit);
2791 enum {
2792 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2795 /* Set the domain boundaries. Use for static (or no) load balancing,
2796 * and also for the starting state for dynamic load balancing.
2797 * setmode determines if and where the boundaries are stored; use the enum above.
2798 * Returns the number of communication pulses in npulse.
2800 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
2801 int setmode, ivec npulse)
2803 gmx_domdec_comm_t *comm;
2804 int d, j;
2805 rvec cellsize_min;
2806 real *cell_x, cell_dx, cellsize;
2808 comm = dd->comm;
2810 for (d = 0; d < DIM; d++)
2812 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2813 npulse[d] = 1;
2814 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2816 /* Uniform grid */
2817 cell_dx = ddbox->box_size[d]/dd->nc[d];
2818 switch (setmode)
2820 case setcellsizeslbMASTER:
2821 for (j = 0; j < dd->nc[d]+1; j++)
2823 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2825 break;
2826 case setcellsizeslbLOCAL:
2827 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2828 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2829 break;
2830 default:
2831 break;
2833 cellsize = cell_dx*ddbox->skew_fac[d];
2834 while (cellsize*npulse[d] < comm->cutoff)
2836 npulse[d]++;
2838 cellsize_min[d] = cellsize;
2840 else
2842 /* Statically load balanced grid */
2843 /* Also when we are not doing a master distribution we determine
2844 * all cell borders in a loop to obtain identical values
2845 * to the master distribution case and to determine npulse.
2847 if (setmode == setcellsizeslbMASTER)
2849 cell_x = dd->ma->cell_x[d];
2851 else
2853 snew(cell_x, dd->nc[d]+1);
2855 cell_x[0] = ddbox->box0[d];
2856 for (j = 0; j < dd->nc[d]; j++)
2858 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2859 cell_x[j+1] = cell_x[j] + cell_dx;
2860 cellsize = cell_dx*ddbox->skew_fac[d];
2861 while (cellsize*npulse[d] < comm->cutoff &&
2862 npulse[d] < dd->nc[d]-1)
2864 npulse[d]++;
2866 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2868 if (setmode == setcellsizeslbLOCAL)
2870 comm->cell_x0[d] = cell_x[dd->ci[d]];
2871 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2873 if (setmode != setcellsizeslbMASTER)
2875 sfree(cell_x);
2878 /* The following limitation is to avoid a cell receiving
2879 * some of its own home charge groups back over the periodic boundary.
2880 * Duplicate charge groups cause trouble with the global indices.
2882 if (d < ddbox->npbcdim &&
2883 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2885 char error_string[STRLEN];
2887 sprintf(error_string,
2888 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2889 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2890 comm->cutoff,
2891 dd->nc[d], dd->nc[d],
2892 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2894 if (setmode == setcellsizeslbLOCAL)
2896 gmx_fatal_collective(FARGS, NULL, dd, error_string);
2898 else
2900 gmx_fatal(FARGS, error_string);
2905 if (!dlbIsOn(comm))
2907 copy_rvec(cellsize_min, comm->cellsize_min);
2910 for (d = 0; d < comm->npmedecompdim; d++)
2912 set_pme_maxshift(dd, &comm->ddpme[d],
2913 comm->slb_frac[dd->dim[d]] == NULL, ddbox,
2914 comm->ddpme[d].slb_dim_f);
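/* Illustrative example (hypothetical values): the while loops above
 * effectively compute npulse[d] = ceil(cutoff/cellsize). For instance,
 * a 6 nm box split into 4 uniform cells gives 1.5 nm cells; with a
 * 2.0 nm cut-off one pulse (1.5 nm) is not enough, so npulse[d] = 2.
 */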
2919 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2920 int d, int dim, domdec_root_t *root,
2921 gmx_ddbox_t *ddbox,
2922 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2924 gmx_domdec_comm_t *comm;
2925 int ncd, i, j, nmin, nmin_old;
2926 gmx_bool bLimLo, bLimHi;
2927 real *cell_size;
2928 real fac, halfway, cellsize_limit_f_i, region_size;
2929 gmx_bool bPBC, bLastHi = FALSE;
2930 int nrange[] = {range[0], range[1]};
2932 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2934 comm = dd->comm;
2936 ncd = dd->nc[dim];
2938 bPBC = (dim < ddbox->npbcdim);
2940 cell_size = root->buf_ncd;
2942 if (debug)
2944 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2947 /* First we need to check if the scaling does not make cells
2948 * smaller than the smallest allowed size.
2949 * We need to do this iteratively, since if a cell is too small,
2950 * it needs to be enlarged, which makes all the other cells smaller,
2951 * which could in turn make another cell smaller than allowed.
2953 for (i = range[0]; i < range[1]; i++)
2955 root->bCellMin[i] = FALSE;
2957 nmin = 0;
2960 nmin_old = nmin;
2961 /* We need the total for normalization */
2962 fac = 0;
2963 for (i = range[0]; i < range[1]; i++)
2965 if (root->bCellMin[i] == FALSE)
2967 fac += cell_size[i];
2970 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2971 /* Determine the cell boundaries */
2972 for (i = range[0]; i < range[1]; i++)
2974 if (root->bCellMin[i] == FALSE)
2976 cell_size[i] *= fac;
2977 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2979 cellsize_limit_f_i = 0;
2981 else
2983 cellsize_limit_f_i = cellsize_limit_f;
2985 if (cell_size[i] < cellsize_limit_f_i)
2987 root->bCellMin[i] = TRUE;
2988 cell_size[i] = cellsize_limit_f_i;
2989 nmin++;
2992 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2995 while (nmin > nmin_old);
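/* Illustrative example (hypothetical values) for the iteration above:
 * three cells with normalized sizes 0.5, 0.3, 0.2 and
 * cellsize_limit_f = 0.25. The first pass (fac = 1) clamps the 0.2 cell
 * to 0.25 (nmin = 1); the second pass rescales the other two cells by
 * fac = (1 - 0.25)/0.8 = 0.9375 to 0.46875 and 0.28125, both still above
 * the limit, so nmin == nmin_old and the loop terminates.
 */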
2997 i = range[1]-1;
2998 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2999 /* For this check we should not use DD_CELL_MARGIN,
3000 * but a slightly smaller factor,
3001 * since rounding could get us below the limit.
3003 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3005 char buf[22];
3006 gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3007 gmx_step_str(step, buf),
3008 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3009 ncd, comm->cellsize_min[dim]);
3012 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3014 if (!bUniform)
3016 /* Check if the boundary did not displace more than halfway
3017 * each of the cells it bounds, as this could cause problems,
3018 * especially when the differences between cell sizes are large.
3019 * If changes are applied, they will not make cells smaller
3020 * than the cut-off, as we check all the boundaries which
3021 * might be affected by a change and if the old state was ok,
3022 * the cells will at most be shrunk back to their old size.
3024 for (i = range[0]+1; i < range[1]; i++)
3026 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3027 if (root->cell_f[i] < halfway)
3029 root->cell_f[i] = halfway;
3030 /* Check if the change also causes shifts of the next boundaries */
3031 for (j = i+1; j < range[1]; j++)
3033 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3035 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3039 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3040 if (root->cell_f[i] > halfway)
3042 root->cell_f[i] = halfway;
3043 /* Check if the change also causes shifts of the next boundaries */
3044 for (j = i-1; j >= range[0]+1; j--)
3046 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3048 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3055 /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
3056 /* Find the highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b),
3057 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3058 * For a and b, nrange is used. */
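/* Illustrative example (hypothetical indices) of the splitting described
 * above, with boundaries 0..8: a LimLo violation at i = 3 followed by a
 * LimHi violation at i = 6 pins cell_f[3] to bound_min[3] and recurses on
 * [0,3], then pins cell_f[6] to bound_max[6] and recurses on [3,6]; the
 * remainder [6,8] is handled by the recursion after the loop, since
 * nrange[0] > range[0] there.
 */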
3059 if (d > 0)
3061 /* Take care of the staggering of the cell boundaries */
3062 if (bUniform)
3064 for (i = range[0]; i < range[1]; i++)
3066 root->cell_f_max0[i] = root->cell_f[i];
3067 root->cell_f_min1[i] = root->cell_f[i+1];
3070 else
3072 for (i = range[0]+1; i < range[1]; i++)
3074 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3075 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3076 if (bLimLo && bLimHi)
3078 /* Both limits violated, try the best we can */
3079 /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3080 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3081 nrange[0] = range[0];
3082 nrange[1] = i;
3083 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3085 nrange[0] = i;
3086 nrange[1] = range[1];
3087 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3089 return;
3091 else if (bLimLo)
3093 /* root->cell_f[i] = root->bound_min[i]; */
3094 nrange[1] = i; /* only store violation location. There could be a LimLo violation following with a higher index */
3095 bLastHi = FALSE;
3097 else if (bLimHi && !bLastHi)
3099 bLastHi = TRUE;
3100 if (nrange[1] < range[1]) /* found a LimLo before */
3102 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3103 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3104 nrange[0] = nrange[1];
3106 root->cell_f[i] = root->bound_max[i];
3107 nrange[1] = i;
3108 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3109 nrange[0] = i;
3110 nrange[1] = range[1];
3113 if (nrange[1] < range[1]) /* the last violation found was a LimLo */
3115 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3116 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3117 nrange[0] = nrange[1];
3118 nrange[1] = range[1];
3119 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3121 else if (nrange[0] > range[0]) /* found at least one LimHi */
3123 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3130 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3131 int d, int dim, domdec_root_t *root,
3132 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3133 gmx_bool bUniform, gmx_int64_t step)
3135 gmx_domdec_comm_t *comm;
3136 int ncd, d1, i, pos;
3137 real *cell_size;
3138 real load_aver, load_i, imbalance, change, change_max, sc;
3139 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3140 real change_limit;
3141 real relax = 0.5;
3142 gmx_bool bPBC;
3143 int range[] = { 0, 0 };
3145 comm = dd->comm;
3147 /* Convert the maximum change from the input percentage to a fraction */
3148 change_limit = comm->dlb_scale_lim*0.01;
3150 ncd = dd->nc[dim];
3152 bPBC = (dim < ddbox->npbcdim);
3154 cell_size = root->buf_ncd;
3156 /* Store the original boundaries */
3157 for (i = 0; i < ncd+1; i++)
3159 root->old_cell_f[i] = root->cell_f[i];
3161 if (bUniform)
3163 for (i = 0; i < ncd; i++)
3165 cell_size[i] = 1.0/ncd;
3168 else if (dd_load_count(comm) > 0)
3170 load_aver = comm->load[d].sum_m/ncd;
3171 change_max = 0;
3172 for (i = 0; i < ncd; i++)
3174 /* Determine the relative imbalance of cell i */
3175 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3176 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3177 /* Determine the change of the cell size using underrelaxation */
3178 change = -relax*imbalance;
3179 change_max = std::max(change_max, std::max(change, -change));
3181 /* Limit the amount of scaling.
3182 * We need to use the same rescaling for all cells in one row,
3183 * otherwise the load balancing might not converge.
3185 sc = relax;
3186 if (change_max > change_limit)
3188 sc *= change_limit/change_max;
3190 for (i = 0; i < ncd; i++)
3192 /* Determine the relative imbalance of cell i */
3193 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3194 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3195 /* Determine the change of the cell size using underrelaxation */
3196 change = -sc*imbalance;
3197 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
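/* Illustrative example (hypothetical values) for the underrelaxation
 * above: with load_aver = 100 and load_i = 120 the imbalance is 0.2,
 * so with relax = 0.5 the cell is shrunk by only 10% instead of the
 * full 20%, which damps oscillations. If any cell in the row would
 * change by more than change_limit, sc rescales the whole row so the
 * largest change equals exactly change_limit.
 */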
3201 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3202 cellsize_limit_f *= DD_CELL_MARGIN;
3203 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3204 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3205 if (ddbox->tric_dir[dim])
3207 cellsize_limit_f /= ddbox->skew_fac[dim];
3208 dist_min_f /= ddbox->skew_fac[dim];
3210 if (bDynamicBox && d > 0)
3212 dist_min_f *= DD_PRES_SCALE_MARGIN;
3214 if (d > 0 && !bUniform)
3216 /* Make sure that the grid is not shifted too much */
3217 for (i = 1; i < ncd; i++)
3219 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3221 gmx_incons("Inconsistent DD boundary staggering limits!");
3223 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3224 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3225 if (space > 0)
3227 root->bound_min[i] += 0.5*space;
3229 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3230 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3231 if (space < 0)
3233 root->bound_max[i] += 0.5*space;
3235 if (debug)
3237 fprintf(debug,
3238 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3239 d, i,
3240 root->cell_f_max0[i-1] + dist_min_f,
3241 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3242 root->cell_f_min1[i] - dist_min_f);
3246 range[1] = ncd;
3247 root->cell_f[0] = 0;
3248 root->cell_f[ncd] = 1;
3249 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3252 /* After the checks above, the cells should obey the cut-off
3253 * restrictions, but it does not hurt to check.
3255 for (i = 0; i < ncd; i++)
3257 if (debug)
3259 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3260 dim, i, root->cell_f[i], root->cell_f[i+1]);
3263 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3264 root->cell_f[i+1] - root->cell_f[i] <
3265 cellsize_limit_f/DD_CELL_MARGIN)
3267 char buf[22];
3268 fprintf(stderr,
3269 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3270 gmx_step_str(step, buf), dim2char(dim), i,
3271 (root->cell_f[i+1] - root->cell_f[i])
3272 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3276 pos = ncd + 1;
3277 /* Store the cell boundaries of the lower dimensions at the end */
3278 for (d1 = 0; d1 < d; d1++)
3280 root->cell_f[pos++] = comm->cell_f0[d1];
3281 root->cell_f[pos++] = comm->cell_f1[d1];
3284 if (d < comm->npmedecompdim)
3286 /* The master determines the maximum shift for
3287 * the coordinate communication between separate PME nodes.
3289 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3291 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3292 if (d >= 1)
3294 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3298 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3299 gmx_ddbox_t *ddbox, int dimind)
3301 gmx_domdec_comm_t *comm;
3302 int dim;
3304 comm = dd->comm;
3306 /* Set the cell dimensions */
3307 dim = dd->dim[dimind];
3308 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3309 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3310 if (dim >= ddbox->nboundeddim)
3312 comm->cell_x0[dim] += ddbox->box0[dim];
3313 comm->cell_x1[dim] += ddbox->box0[dim];
3317 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3318 int d, int dim, real *cell_f_row,
3319 gmx_ddbox_t *ddbox)
3321 gmx_domdec_comm_t *comm;
3322 int d1, pos;
3324 comm = dd->comm;
3326 #ifdef GMX_MPI
3327 /* Each node would only need to know two fractions,
3328 * but it is probably cheaper to broadcast the whole array.
3330 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3331 0, comm->mpi_comm_load[d]);
3332 #endif
3333 /* Copy the fractions for this dimension from the buffer */
3334 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3335 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3336 /* The whole array was communicated, so set the buffer position */
3337 pos = dd->nc[dim] + 1;
3338 for (d1 = 0; d1 <= d; d1++)
3340 if (d1 < d)
3342 /* Copy the cell fractions of the lower dimensions */
3343 comm->cell_f0[d1] = cell_f_row[pos++];
3344 comm->cell_f1[d1] = cell_f_row[pos++];
3346 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3348 /* Convert the communicated shift from float to int */
3349 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3350 if (d >= 1)
3352 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3356 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3357 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3358 gmx_bool bUniform, gmx_int64_t step)
3360 gmx_domdec_comm_t *comm;
3361 int d, dim, d1;
3362 gmx_bool bRowMember, bRowRoot;
3363 real *cell_f_row;
3365 comm = dd->comm;
3367 for (d = 0; d < dd->ndim; d++)
3369 dim = dd->dim[d];
3370 bRowMember = TRUE;
3371 bRowRoot = TRUE;
3372 for (d1 = d; d1 < dd->ndim; d1++)
3374 if (dd->ci[dd->dim[d1]] > 0)
3376 if (d1 != d)
3378 bRowMember = FALSE;
3380 bRowRoot = FALSE;
3383 if (bRowMember)
3385 if (bRowRoot)
3387 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3388 ddbox, bDynamicBox, bUniform, step);
3389 cell_f_row = comm->root[d]->cell_f;
3391 else
3393 cell_f_row = comm->cell_f_row;
3395 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3400 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3402 int d;
3404 /* This function assumes the box is static and should therefore
3405 * not be called when the box has changed since the last
3406 * call to dd_partition_system.
3408 for (d = 0; d < dd->ndim; d++)
3410 relative_to_absolute_cell_bounds(dd, ddbox, d);
3416 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3417 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3418 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3419 gmx_wallcycle_t wcycle)
3421 gmx_domdec_comm_t *comm;
3422 int dim;
3424 comm = dd->comm;
3426 if (bDoDLB)
3428 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3429 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3430 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3432 else if (bDynamicBox)
3434 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3437 /* Set the dimensions for which no DD is used */
3438 for (dim = 0; dim < DIM; dim++)
3440 if (dd->nc[dim] == 1)
3442 comm->cell_x0[dim] = 0;
3443 comm->cell_x1[dim] = ddbox->box_size[dim];
3444 if (dim >= ddbox->nboundeddim)
3446 comm->cell_x0[dim] += ddbox->box0[dim];
3447 comm->cell_x1[dim] += ddbox->box0[dim];
3453 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3455 int d, np, i;
3456 gmx_domdec_comm_dim_t *cd;
3458 for (d = 0; d < dd->ndim; d++)
3460 cd = &dd->comm->cd[d];
3461 np = npulse[dd->dim[d]];
3462 if (np > cd->np_nalloc)
3464 if (debug)
3466 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3467 dim2char(dd->dim[d]), np);
3469 if (DDMASTER(dd) && cd->np_nalloc > 0)
3471 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3473 srenew(cd->ind, np);
3474 for (i = cd->np_nalloc; i < np; i++)
3476 cd->ind[i].index = NULL;
3477 cd->ind[i].nalloc = 0;
3479 cd->np_nalloc = np;
3481 cd->np = np;
3486 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3487 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3488 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3489 gmx_wallcycle_t wcycle)
3491 gmx_domdec_comm_t *comm;
3492 int d;
3493 ivec npulse;
3495 comm = dd->comm;
3497 /* Copy the old cell boundaries for the cg displacement check */
3498 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3499 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3501 if (dlbIsOn(comm))
3503 if (DDMASTER(dd))
3505 check_box_size(dd, ddbox);
3507 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3509 else
3511 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3512 realloc_comm_ind(dd, npulse);
3515 if (debug)
3517 for (d = 0; d < DIM; d++)
3519 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3520 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3525 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3526 gmx_ddbox_t *ddbox,
3527 rvec cell_ns_x0, rvec cell_ns_x1,
3528 gmx_int64_t step)
3530 gmx_domdec_comm_t *comm;
3531 int dim_ind, dim;
3533 comm = dd->comm;
3535 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3537 dim = dd->dim[dim_ind];
3539 /* Without PBC we don't have restrictions on the outer cells */
3540 if (!(dim >= ddbox->npbcdim &&
3541 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3542 dlbIsOn(comm) &&
3543 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3544 comm->cellsize_min[dim])
3546 char buf[22];
3547 gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3548 gmx_step_str(step, buf), dim2char(dim),
3549 comm->cell_x1[dim] - comm->cell_x0[dim],
3550 ddbox->skew_fac[dim],
3551 dd->comm->cellsize_min[dim],
3552 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3556 if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3558 /* Communicate the boundaries and update cell_ns_x0/1 */
3559 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3560 if (dlbIsOn(dd->comm) && dd->ndim > 1)
3562 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3567 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3569 if (YY < npbcdim)
3571 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3573 else
3575 tcm[YY][XX] = 0;
3577 if (ZZ < npbcdim)
3579 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3580 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3582 else
3584 tcm[ZZ][XX] = 0;
3585 tcm[ZZ][YY] = 0;
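/* Annotation (derived from the assignments above): tcm cancels the
 * off-diagonal (triclinic) box contributions, so that cell assignment
 * can use the diagonal box elements alone. E.g. for a point with
 * y = box[YY][YY], adding y*tcm[YY][XX] subtracts exactly box[YY][XX]
 * from its x coordinate.
 */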
3589 static void check_screw_box(matrix box)
3591 /* Mathematical limitation */
3592 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3594 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3597 /* Limitation due to the asymmetry of the eighth shell method */
3598 if (box[ZZ][YY] != 0)
3600 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3604 static void distribute_cg(FILE *fplog,
3605 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3606 gmx_domdec_t *dd)
3608 gmx_domdec_master_t *ma;
3609 int **tmp_ind = NULL, *tmp_nalloc = NULL;
3610 int i, icg, j, k, k0, k1, d;
3611 matrix tcm;
3612 rvec cg_cm;
3613 ivec ind;
3614 real nrcg, inv_ncg, pos_d;
3615 atom_id *cgindex;
3616 gmx_bool bScrew;
3618 ma = dd->ma;
3620 if (tmp_ind == NULL)
3622 snew(tmp_nalloc, dd->nnodes);
3623 snew(tmp_ind, dd->nnodes);
3624 for (i = 0; i < dd->nnodes; i++)
3626 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3627 snew(tmp_ind[i], tmp_nalloc[i]);
3631 /* Clear the count */
3632 for (i = 0; i < dd->nnodes; i++)
3634 ma->ncg[i] = 0;
3635 ma->nat[i] = 0;
3638 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3640 cgindex = cgs->index;
3642 /* Compute the center of geometry for all charge groups */
3643 for (icg = 0; icg < cgs->nr; icg++)
3645 k0 = cgindex[icg];
3646 k1 = cgindex[icg+1];
3647 nrcg = k1 - k0;
3648 if (nrcg == 1)
3650 copy_rvec(pos[k0], cg_cm);
3652 else
3654 inv_ncg = 1.0/nrcg;
3656 clear_rvec(cg_cm);
3657 for (k = k0; (k < k1); k++)
3659 rvec_inc(cg_cm, pos[k]);
3661 for (d = 0; (d < DIM); d++)
3663 cg_cm[d] *= inv_ncg;
3666 /* Put the charge group in the box and determine the cell index */
3667 for (d = DIM-1; d >= 0; d--)
3669 pos_d = cg_cm[d];
3670 if (d < dd->npbcdim)
3672 bScrew = (dd->bScrewPBC && d == XX);
3673 if (tric_dir[d] && dd->nc[d] > 1)
3675 /* Use triclinic coordinates for this dimension */
3676 for (j = d+1; j < DIM; j++)
3678 pos_d += cg_cm[j]*tcm[j][d];
3681 while (pos_d >= box[d][d])
3683 pos_d -= box[d][d];
3684 rvec_dec(cg_cm, box[d]);
3685 if (bScrew)
3687 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3688 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3690 for (k = k0; (k < k1); k++)
3692 rvec_dec(pos[k], box[d]);
3693 if (bScrew)
3695 pos[k][YY] = box[YY][YY] - pos[k][YY];
3696 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3700 while (pos_d < 0)
3702 pos_d += box[d][d];
3703 rvec_inc(cg_cm, box[d]);
3704 if (bScrew)
3706 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3707 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3709 for (k = k0; (k < k1); k++)
3711 rvec_inc(pos[k], box[d]);
3712 if (bScrew)
3714 pos[k][YY] = box[YY][YY] - pos[k][YY];
3715 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3720 /* This could be done more efficiently */
3721 ind[d] = 0;
3722 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3724 ind[d]++;
3727 i = dd_index(dd->nc, ind);
3728 if (ma->ncg[i] == tmp_nalloc[i])
3730 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3731 srenew(tmp_ind[i], tmp_nalloc[i]);
3733 tmp_ind[i][ma->ncg[i]] = icg;
3734 ma->ncg[i]++;
3735 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3738 k1 = 0;
3739 for (i = 0; i < dd->nnodes; i++)
3741 ma->index[i] = k1;
3742 for (k = 0; k < ma->ncg[i]; k++)
3744 ma->cg[k1++] = tmp_ind[i][k];
3747 ma->index[dd->nnodes] = k1;
3749 for (i = 0; i < dd->nnodes; i++)
3751 sfree(tmp_ind[i]);
3753 sfree(tmp_ind);
3754 sfree(tmp_nalloc);
3756 if (fplog)
3758 /* Here we avoid int overflows due to #atoms^2: use double, dsqr */
3759 int nat_sum, nat_min, nat_max;
3760 double nat2_sum;
3762 nat_sum = 0;
3763 nat2_sum = 0;
3764 nat_min = ma->nat[0];
3765 nat_max = ma->nat[0];
3766 for (i = 0; i < dd->nnodes; i++)
3768 nat_sum += ma->nat[i];
3769 nat2_sum += dsqr(ma->nat[i]);
3770 nat_min = std::min(nat_min, ma->nat[i]);
3771 nat_max = std::max(nat_max, ma->nat[i]);
3773 nat_sum /= dd->nnodes;
3774 nat2_sum /= dd->nnodes;
3776 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3777 dd->nnodes,
3778 nat_sum,
3779 static_cast<int>(sqrt(nat2_sum - dsqr(nat_sum) + 0.5)),
3780 nat_min, nat_max);
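/* Annotation: the printed stddev uses the identity
 * stddev = sqrt(<nat^2> - <nat>^2); nat2_sum is accumulated in double
 * precision because the sum of squared atom counts can overflow a
 * 32-bit int for large systems.
 */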
3784 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3785 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3786 rvec pos[])
3788 gmx_domdec_master_t *ma = NULL;
3789 ivec npulse;
3790 int i, cg_gl;
3791 int *ibuf, buf2[2] = { 0, 0 };
3792 gmx_bool bMaster = DDMASTER(dd);
3794 if (bMaster)
3796 ma = dd->ma;
3798 if (dd->bScrewPBC)
3800 check_screw_box(box);
3803 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3805 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3806 for (i = 0; i < dd->nnodes; i++)
3808 ma->ibuf[2*i] = ma->ncg[i];
3809 ma->ibuf[2*i+1] = ma->nat[i];
3811 ibuf = ma->ibuf;
3813 else
3815 ibuf = NULL;
3817 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3819 dd->ncg_home = buf2[0];
3820 dd->nat_home = buf2[1];
3821 dd->ncg_tot = dd->ncg_home;
3822 dd->nat_tot = dd->nat_home;
3823 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3825 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3826 srenew(dd->index_gl, dd->cg_nalloc);
3827 srenew(dd->cgindex, dd->cg_nalloc+1);
3829 if (bMaster)
3831 for (i = 0; i < dd->nnodes; i++)
3833 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3834 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3838 dd_scatterv(dd,
3839 bMaster ? ma->ibuf : NULL,
3840 bMaster ? ma->ibuf+dd->nnodes : NULL,
3841 bMaster ? ma->cg : NULL,
3842 dd->ncg_home*sizeof(int), dd->index_gl);
3844 /* Determine the home charge group sizes */
3845 dd->cgindex[0] = 0;
3846 for (i = 0; i < dd->ncg_home; i++)
3848 cg_gl = dd->index_gl[i];
3849 dd->cgindex[i+1] =
3850 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3853 if (debug)
3855 fprintf(debug, "Home charge groups:\n");
3856 for (i = 0; i < dd->ncg_home; i++)
3858 fprintf(debug, " %d", dd->index_gl[i]);
3859 if (i % 10 == 9)
3861 fprintf(debug, "\n");
3864 fprintf(debug, "\n");
3868 static int compact_and_copy_vec_at(int ncg, int *move,
3869 int *cgindex,
3870 int nvec, int vec,
3871 rvec *src, gmx_domdec_comm_t *comm,
3872 gmx_bool bCompact)
3874 int m, icg, i, i0, i1, nrcg;
3875 int home_pos;
3876 int pos_vec[DIM*2];
3878 home_pos = 0;
3880 for (m = 0; m < DIM*2; m++)
3882 pos_vec[m] = 0;
3885 i0 = 0;
3886 for (icg = 0; icg < ncg; icg++)
3888 i1 = cgindex[icg+1];
3889 m = move[icg];
3890 if (m == -1)
3892 if (bCompact)
3894 /* Compact the home array in place */
3895 for (i = i0; i < i1; i++)
3897 copy_rvec(src[i], src[home_pos++]);
3901 else
3903 /* Copy to the communication buffer */
3904 nrcg = i1 - i0;
3905 pos_vec[m] += 1 + vec*nrcg;
3906 for (i = i0; i < i1; i++)
3908 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3910 pos_vec[m] += (nvec - vec - 1)*nrcg;
3912 if (!bCompact)
3914 home_pos += i1 - i0;
3916 i0 = i1;
3919 return home_pos;
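/* Annotation (layout implied by the pos_vec arithmetic above): per moved
 * charge group the cgcm_state[m] buffer holds 1 rvec (the cg center)
 * followed by nvec blocks of nrcg rvecs, so vector number vec lives at
 * offset 1 + vec*nrcg within the group's slot. E.g. with nrcg = 2 and
 * nvec = 2 (x and v) the slot is [cgcm, x0, x1, v0, v1], 5 rvecs.
 */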
3922 static int compact_and_copy_vec_cg(int ncg, int *move,
3923 int *cgindex,
3924 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3925 gmx_bool bCompact)
3927 int m, icg, i0, i1, nrcg;
3928 int home_pos;
3929 int pos_vec[DIM*2];
3931 home_pos = 0;
3933 for (m = 0; m < DIM*2; m++)
3935 pos_vec[m] = 0;
3938 i0 = 0;
3939 for (icg = 0; icg < ncg; icg++)
3941 i1 = cgindex[icg+1];
3942 m = move[icg];
3943 if (m == -1)
3945 if (bCompact)
3947 /* Compact the home array in place */
3948 copy_rvec(src[icg], src[home_pos++]);
3951 else
3953 nrcg = i1 - i0;
3954 /* Copy to the communication buffer */
3955 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3956 pos_vec[m] += 1 + nrcg*nvec;
3958 i0 = i1;
3960 if (!bCompact)
3962 home_pos = ncg;
3965 return home_pos;
3968 static int compact_ind(int ncg, int *move,
3969 int *index_gl, int *cgindex,
3970 int *gatindex,
3971 gmx_ga2la_t ga2la, char *bLocalCG,
3972 int *cginfo)
3974 int cg, nat, a0, a1, a, a_gl;
3975 int home_pos;
3977 home_pos = 0;
3978 nat = 0;
3979 for (cg = 0; cg < ncg; cg++)
3981 a0 = cgindex[cg];
3982 a1 = cgindex[cg+1];
3983 if (move[cg] == -1)
3985 /* Compact the home arrays in place.
3986 * Anything that can be done here avoids access to global arrays.
3988 cgindex[home_pos] = nat;
3989 for (a = a0; a < a1; a++)
3991 a_gl = gatindex[a];
3992 gatindex[nat] = a_gl;
3993 /* The cell number stays 0, so we don't need to set it */
3994 ga2la_change_la(ga2la, a_gl, nat);
3995 nat++;
3997 index_gl[home_pos] = index_gl[cg];
3998 cginfo[home_pos] = cginfo[cg];
3999 /* The charge group remains local, so bLocalCG does not change */
4000 home_pos++;
4002 else
4004 /* Clear the global indices */
4005 for (a = a0; a < a1; a++)
4007 ga2la_del(ga2la, gatindex[a]);
4009 if (bLocalCG)
4011 bLocalCG[index_gl[cg]] = FALSE;
4015 cgindex[home_pos] = nat;
4017 return home_pos;
4020 static void clear_and_mark_ind(int ncg, int *move,
4021 int *index_gl, int *cgindex, int *gatindex,
4022 gmx_ga2la_t ga2la, char *bLocalCG,
4023 int *cell_index)
4025 int cg, a0, a1, a;
4027 for (cg = 0; cg < ncg; cg++)
4029 if (move[cg] >= 0)
4031 a0 = cgindex[cg];
4032 a1 = cgindex[cg+1];
4033 /* Clear the global indices */
4034 for (a = a0; a < a1; a++)
4036 ga2la_del(ga2la, gatindex[a]);
4038 if (bLocalCG)
4040 bLocalCG[index_gl[cg]] = FALSE;
4042 /* Signal that this cg has moved using the ns cell index.
4043 * Here we set it to -1. fill_grid will change it
4044 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4046 cell_index[cg] = -1;
4051 static void print_cg_move(FILE *fplog,
4052 gmx_domdec_t *dd,
4053 gmx_int64_t step, int cg, int dim, int dir,
4054 gmx_bool bHaveCgcmOld, real limitd,
4055 rvec cm_old, rvec cm_new, real pos_d)
4057 gmx_domdec_comm_t *comm;
4058 char buf[22];
4060 comm = dd->comm;
4062 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4063 if (limitd > 0)
4065 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4066 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4067 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4069 else
4071 /* We don't have a limiting distance available: don't print it */
4072 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4073 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4074 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4076 fprintf(fplog, "distance out of cell %f\n",
4077 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4078 if (bHaveCgcmOld)
4080 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4081 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4083 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4084 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4085 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4086 dim2char(dim),
4087 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4088 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4089 dim2char(dim),
4090 comm->cell_x0[dim], comm->cell_x1[dim]);
4093 static void cg_move_error(FILE *fplog,
4094 gmx_domdec_t *dd,
4095 gmx_int64_t step, int cg, int dim, int dir,
4096 gmx_bool bHaveCgcmOld, real limitd,
4097 rvec cm_old, rvec cm_new, real pos_d)
4099 if (fplog)
4101 print_cg_move(fplog, dd, step, cg, dim, dir,
4102 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4104 print_cg_move(stderr, dd, step, cg, dim, dir,
4105 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4106 gmx_fatal(FARGS,
4107 "%s moved too far between two domain decomposition steps\n"
4108 "This usually means that your system is not well equilibrated",
4109 dd->comm->bCGs ? "A charge group" : "An atom");
4112 static void rotate_state_atom(t_state *state, int a)
4114 int est;
4116 for (est = 0; est < estNR; est++)
4118 if (EST_DISTR(est) && (state->flags & (1<<est)))
4120 switch (est)
4122 case estX:
4123 /* Rotate the complete state; for a rectangular box only */
4124 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4125 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4126 break;
4127 case estV:
4128 state->v[a][YY] = -state->v[a][YY];
4129 state->v[a][ZZ] = -state->v[a][ZZ];
4130 break;
4131 case estSDX:
4132 state->sd_X[a][YY] = -state->sd_X[a][YY];
4133 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4134 break;
4135 case estCGP:
4136 state->cg_p[a][YY] = -state->cg_p[a][YY];
4137 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4138 break;
4139 case estDISRE_INITF:
4140 case estDISRE_RM3TAV:
4141 case estORIRE_INITF:
4142 case estORIRE_DTAV:
4143 /* These are distances, so not affected by rotation */
4144 break;
4145 default:
4146 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4152 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4154 if (natoms > comm->moved_nalloc)
4156 /* Contents should be preserved here */
4157 comm->moved_nalloc = over_alloc_dd(natoms);
4158 srenew(comm->moved, comm->moved_nalloc);
4161 return comm->moved;
4164 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4165 gmx_domdec_t *dd,
4166 t_state *state,
4167 ivec tric_dir, matrix tcm,
4168 rvec cell_x0, rvec cell_x1,
4169 rvec limitd, rvec limit0, rvec limit1,
4170 const int *cgindex,
4171 int cg_start, int cg_end,
4172 rvec *cg_cm,
4173 int *move)
4175 int npbcdim;
4176 int cg, k, k0, k1, d, dim, d2;
4177 int mc, nrcg;
4178 int flag;
4179 gmx_bool bScrew;
4180 ivec dev;
4181 real inv_ncg, pos_d;
4182 rvec cm_new;
4184 npbcdim = dd->npbcdim;
4186 for (cg = cg_start; cg < cg_end; cg++)
4188 k0 = cgindex[cg];
4189 k1 = cgindex[cg+1];
4190 nrcg = k1 - k0;
4191 if (nrcg == 1)
4193 copy_rvec(state->x[k0], cm_new);
4195 else
4197 inv_ncg = 1.0/nrcg;
4199 clear_rvec(cm_new);
4200 for (k = k0; (k < k1); k++)
4202 rvec_inc(cm_new, state->x[k]);
4204 for (d = 0; (d < DIM); d++)
4206 cm_new[d] = inv_ncg*cm_new[d];
4210 clear_ivec(dev);
4211 /* Do pbc and check DD cell boundary crossings */
4212 for (d = DIM-1; d >= 0; d--)
4214 if (dd->nc[d] > 1)
4216 bScrew = (dd->bScrewPBC && d == XX);
4217 /* Determine the location of this cg in lattice coordinates */
4218 pos_d = cm_new[d];
4219 if (tric_dir[d])
4221 for (d2 = d+1; d2 < DIM; d2++)
4223 pos_d += cm_new[d2]*tcm[d2][d];
4226 /* Put the charge group in the triclinic unit-cell */
4227 if (pos_d >= cell_x1[d])
4229 if (pos_d >= limit1[d])
4231 cg_move_error(fplog, dd, step, cg, d, 1,
4232 cg_cm != state->x, limitd[d],
4233 cg_cm[cg], cm_new, pos_d);
4235 dev[d] = 1;
4236 if (dd->ci[d] == dd->nc[d] - 1)
4238 rvec_dec(cm_new, state->box[d]);
4239 if (bScrew)
4241 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4242 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4244 for (k = k0; (k < k1); k++)
4246 rvec_dec(state->x[k], state->box[d]);
4247 if (bScrew)
4249 rotate_state_atom(state, k);
4254 else if (pos_d < cell_x0[d])
4256 if (pos_d < limit0[d])
4258 cg_move_error(fplog, dd, step, cg, d, -1,
4259 cg_cm != state->x, limitd[d],
4260 cg_cm[cg], cm_new, pos_d);
4262 dev[d] = -1;
4263 if (dd->ci[d] == 0)
4265 rvec_inc(cm_new, state->box[d]);
4266 if (bScrew)
4268 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4269 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4271 for (k = k0; (k < k1); k++)
4273 rvec_inc(state->x[k], state->box[d]);
4274 if (bScrew)
4276 rotate_state_atom(state, k);
4282 else if (d < npbcdim)
4284 /* Put the charge group in the rectangular unit-cell */
4285 while (cm_new[d] >= state->box[d][d])
4287 rvec_dec(cm_new, state->box[d]);
4288 for (k = k0; (k < k1); k++)
4290 rvec_dec(state->x[k], state->box[d]);
4293 while (cm_new[d] < 0)
4295 rvec_inc(cm_new, state->box[d]);
4296 for (k = k0; (k < k1); k++)
4298 rvec_inc(state->x[k], state->box[d]);
4304 copy_rvec(cm_new, cg_cm[cg]);
4306 /* Determine where this cg should go */
4307 flag = 0;
4308 mc = -1;
4309 for (d = 0; d < dd->ndim; d++)
4311 dim = dd->dim[d];
4312 if (dev[dim] == 1)
4314 flag |= DD_FLAG_FW(d);
4315 if (mc == -1)
4317 mc = d*2;
4320 else if (dev[dim] == -1)
4322 flag |= DD_FLAG_BW(d);
4323 if (mc == -1)
4325 if (dd->nc[dim] > 2)
4327 mc = d*2 + 1;
4329 else
4331 mc = d*2;
4336 /* Temporarily store the flag in move */
4337 move[cg] = mc + flag;
4341 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4342 gmx_domdec_t *dd, ivec tric_dir,
4343 t_state *state, rvec **f,
4344 t_forcerec *fr,
4345 gmx_bool bCompact,
4346 t_nrnb *nrnb,
4347 int *ncg_stay_home,
4348 int *ncg_moved)
4350 int *move;
4351 int npbcdim;
4352 int ncg[DIM*2], nat[DIM*2];
4353 int c, i, cg, k, d, dim, dim2, dir, d2, d3;
4354 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4355 int sbuf[2], rbuf[2];
4356 int home_pos_cg, home_pos_at, buf_pos;
4357 int flag;
4358 gmx_bool bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4359 real pos_d;
4360 matrix tcm;
4361 rvec *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4362 atom_id *cgindex;
4363 cginfo_mb_t *cginfo_mb;
4364 gmx_domdec_comm_t *comm;
4365 int *moved;
4366 int nthread, thread;
4368 if (dd->bScrewPBC)
4370 check_screw_box(state->box);
4373 comm = dd->comm;
4374 if (fr->cutoff_scheme == ecutsGROUP)
4376 cg_cm = fr->cg_cm;
4379 for (i = 0; i < estNR; i++)
4381 if (EST_DISTR(i))
4383 switch (i)
4385 case estX: /* Always present */ break;
4386 case estV: bV = (state->flags & (1<<i)); break;
4387 case estSDX: bSDX = (state->flags & (1<<i)); break;
4388 case estCGP: bCGP = (state->flags & (1<<i)); break;
4389 case estLD_RNG:
4390 case estLD_RNGI:
4391 case estDISRE_INITF:
4392 case estDISRE_RM3TAV:
4393 case estORIRE_INITF:
4394 case estORIRE_DTAV:
4395 /* No processing required */
4396 break;
4397 default:
4398 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4403 if (dd->ncg_tot > comm->nalloc_int)
4405 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4406 srenew(comm->buf_int, comm->nalloc_int);
4408 move = comm->buf_int;
4410 /* Clear the count */
4411 for (c = 0; c < dd->ndim*2; c++)
4413 ncg[c] = 0;
4414 nat[c] = 0;
4417 npbcdim = dd->npbcdim;
4419 for (d = 0; (d < DIM); d++)
4421 limitd[d] = dd->comm->cellsize_min[d];
4422 if (d >= npbcdim && dd->ci[d] == 0)
4424 cell_x0[d] = -GMX_FLOAT_MAX;
4426 else
4428 cell_x0[d] = comm->cell_x0[d];
4430 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4432 cell_x1[d] = GMX_FLOAT_MAX;
4434 else
4436 cell_x1[d] = comm->cell_x1[d];
4438 if (d < npbcdim)
4440 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4441 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4443 else
4445 /* We check after communication if a charge group moved
4446 * more than one cell. Set the pre-comm check limit to float_max.
4448 limit0[d] = -GMX_FLOAT_MAX;
4449 limit1[d] = GMX_FLOAT_MAX;
4453 make_tric_corr_matrix(npbcdim, state->box, tcm);
4455 cgindex = dd->cgindex;
4457 nthread = gmx_omp_nthreads_get(emntDomdec);
4459 /* Compute the center of geometry for all home charge groups
4460 * and put them in the box and determine where they should go.
4462 #pragma omp parallel for num_threads(nthread) schedule(static)
4463 for (thread = 0; thread < nthread; thread++)
4467 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4468 cell_x0, cell_x1, limitd, limit0, limit1,
4469 cgindex,
4470 ( thread *dd->ncg_home)/nthread,
4471 ((thread+1)*dd->ncg_home)/nthread,
4472 fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4473 move);
4475 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4478 for (cg = 0; cg < dd->ncg_home; cg++)
4480 if (move[cg] >= 0)
4482 mc = move[cg];
4483 flag = mc & ~DD_FLAG_NRCG;
4484 mc = mc & DD_FLAG_NRCG;
4485 move[cg] = mc;
4487 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4489 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4490 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4492 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4493 /* We store the cg size in the lower 16 bits
4494 * and the place where the charge group should go
4495 * in the next 6 bits. This saves some communication volume.
4497 nrcg = cgindex[cg+1] - cgindex[cg];
4498 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4499 ncg[mc] += 1;
4500 nat[mc] += nrcg;
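/* Illustrative example (assuming DD_FLAG_NRCG masks the low 16 bits and
 * the direction flags sit in the bits above, per the comment above): a
 * charge group of nrcg = 3 atoms with a forward flag at bit 16 is sent
 * as 3 | (1 << 16) = 65539; the receiver recovers the size with
 * (value & DD_FLAG_NRCG) and the destination from the higher bits.
 */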
4504 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4505 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4507 *ncg_moved = 0;
4508 for (i = 0; i < dd->ndim*2; i++)
4510 *ncg_moved += ncg[i];
4513 nvec = 1;
4514 if (bV)
4516 nvec++;
4518 if (bSDX)
4520 nvec++;
4522 if (bCGP)
4524 nvec++;
4527 /* Make sure the communication buffers are large enough */
4528 for (mc = 0; mc < dd->ndim*2; mc++)
4530 nvr = ncg[mc] + nat[mc]*nvec;
4531 if (nvr > comm->cgcm_state_nalloc[mc])
4533 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4534 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4538 switch (fr->cutoff_scheme)
4540 case ecutsGROUP:
4541 /* Recalculating cg_cm might be cheaper than communicating,
4542 * but that could give rise to rounding issues.
4544 home_pos_cg =
4545 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4546 nvec, cg_cm, comm, bCompact);
4547 break;
4548 case ecutsVERLET:
4549 /* Without charge groups we send the moved atom coordinates
4550 * over twice. This is so the code below can be used without
4551 * many conditionals, both with and without charge groups.
4553 home_pos_cg =
4554 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4555 nvec, state->x, comm, FALSE);
4556 if (bCompact)
4558 home_pos_cg -= *ncg_moved;
4560 break;
4561 default:
4562 gmx_incons("unimplemented");
4563 home_pos_cg = 0;
4566 vec = 0;
4567 home_pos_at =
4568 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4569 nvec, vec++, state->x, comm, bCompact);
4570 if (bV)
4572 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4573 nvec, vec++, state->v, comm, bCompact);
4575 if (bSDX)
4577 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4578 nvec, vec++, state->sd_X, comm, bCompact);
4580 if (bCGP)
4582 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4583 nvec, vec++, state->cg_p, comm, bCompact);
4586 if (bCompact)
4588 compact_ind(dd->ncg_home, move,
4589 dd->index_gl, dd->cgindex, dd->gatindex,
4590 dd->ga2la, comm->bLocalCG,
4591 fr->cginfo);
4593 else
4595 if (fr->cutoff_scheme == ecutsVERLET)
4597 moved = get_moved(comm, dd->ncg_home);
4599 for (k = 0; k < dd->ncg_home; k++)
4601 moved[k] = 0;
4604 else
4606 moved = fr->ns.grid->cell_index;
4609 clear_and_mark_ind(dd->ncg_home, move,
4610 dd->index_gl, dd->cgindex, dd->gatindex,
4611 dd->ga2la, comm->bLocalCG,
4612 moved);
4615 cginfo_mb = fr->cginfo_mb;
4617 *ncg_stay_home = home_pos_cg;
4618 for (d = 0; d < dd->ndim; d++)
4620 dim = dd->dim[d];
4621 ncg_recv = 0;
4622 nvr = 0;
4623 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4625 cdd = d*2 + dir;
4626 /* Communicate the cg and atom counts */
4627 sbuf[0] = ncg[cdd];
4628 sbuf[1] = nat[cdd];
4629 if (debug)
4631 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4632 d, dir, sbuf[0], sbuf[1]);
4634 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4636 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4638 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4639 srenew(comm->buf_int, comm->nalloc_int);
4642 /* Communicate the charge group indices, sizes and flags */
4643 dd_sendrecv_int(dd, d, dir,
4644 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4645 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4647 nvs = ncg[cdd] + nat[cdd]*nvec;
4648 i = rbuf[0] + rbuf[1] *nvec;
4649 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4651 /* Communicate cgcm and state */
4652 dd_sendrecv_rvec(dd, d, dir,
4653 comm->cgcm_state[cdd], nvs,
4654 comm->vbuf.v+nvr, i);
4655 ncg_recv += rbuf[0];
4656 nvr += i;
4659 /* Process the received charge groups */
4660 buf_pos = 0;
4661 for (cg = 0; cg < ncg_recv; cg++)
4663 flag = comm->buf_int[cg*DD_CGIBS+1];
4665 if (dim >= npbcdim && dd->nc[dim] > 2)
4667 /* No pbc in this dim and more than one domain boundary.
4668 * We do a separate check that a charge group didn't move too far.
4670 if (((flag & DD_FLAG_FW(d)) &&
4671 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4672 ((flag & DD_FLAG_BW(d)) &&
4673 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4675 cg_move_error(fplog, dd, step, cg, dim,
4676 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4677 fr->cutoff_scheme == ecutsGROUP, 0,
4678 comm->vbuf.v[buf_pos],
4679 comm->vbuf.v[buf_pos],
4680 comm->vbuf.v[buf_pos][dim]);
4684 mc = -1;
4685 if (d < dd->ndim-1)
4687 /* Check which direction this cg should go */
4688 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4690 if (dlbIsOn(dd->comm))
4692 /* The cell boundaries for dimension d2 are not equal
4693 * for each cell row of the lower dimension(s),
4694 * therefore we might need to redetermine where
4695 * this cg should go.
4697 dim2 = dd->dim[d2];
4698 /* If this cg crosses the box boundary in dimension d2
4699 * we can use the communicated flag, so we do not
4700 * have to worry about pbc.
4702 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4703 (flag & DD_FLAG_FW(d2))) ||
4704 (dd->ci[dim2] == 0 &&
4705 (flag & DD_FLAG_BW(d2)))))
4707 /* Clear the two flags for this dimension */
4708 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4709 /* Determine the location of this cg
4710 * in lattice coordinates
4712 pos_d = comm->vbuf.v[buf_pos][dim2];
4713 if (tric_dir[dim2])
4715 for (d3 = dim2+1; d3 < DIM; d3++)
4717 pos_d +=
4718 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4721 /* Check that we are not at the box edge.
4722 * pbc is only handled in the first step above,
4723 * but this check could move over pbc while
4724 * the first step did not due to different rounding.
4726 if (pos_d >= cell_x1[dim2] &&
4727 dd->ci[dim2] != dd->nc[dim2]-1)
4729 flag |= DD_FLAG_FW(d2);
4731 else if (pos_d < cell_x0[dim2] &&
4732 dd->ci[dim2] != 0)
4734 flag |= DD_FLAG_BW(d2);
4736 comm->buf_int[cg*DD_CGIBS+1] = flag;
4739 /* Set to which neighboring cell this cg should go */
4740 if (flag & DD_FLAG_FW(d2))
4742 mc = d2*2;
4744 else if (flag & DD_FLAG_BW(d2))
4746 if (dd->nc[dd->dim[d2]] > 2)
4748 mc = d2*2+1;
4750 else
4752 mc = d2*2;
4758 nrcg = flag & DD_FLAG_NRCG;
4759 if (mc == -1)
4761 if (home_pos_cg+1 > dd->cg_nalloc)
4763 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4764 srenew(dd->index_gl, dd->cg_nalloc);
4765 srenew(dd->cgindex, dd->cg_nalloc+1);
4767 /* Set the global charge group index and size */
4768 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4769 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4770 /* Copy the state from the buffer */
4771 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4772 if (fr->cutoff_scheme == ecutsGROUP)
4774 cg_cm = fr->cg_cm;
4775 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4777 buf_pos++;
4779 /* Set the cginfo */
4780 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4781 dd->index_gl[home_pos_cg]);
4782 if (comm->bLocalCG)
4784 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4787 if (home_pos_at+nrcg > state->nalloc)
4789 dd_realloc_state(state, f, home_pos_at+nrcg);
4791 for (i = 0; i < nrcg; i++)
4793 copy_rvec(comm->vbuf.v[buf_pos++],
4794 state->x[home_pos_at+i]);
4796 if (bV)
4798 for (i = 0; i < nrcg; i++)
4800 copy_rvec(comm->vbuf.v[buf_pos++],
4801 state->v[home_pos_at+i]);
4804 if (bSDX)
4806 for (i = 0; i < nrcg; i++)
4808 copy_rvec(comm->vbuf.v[buf_pos++],
4809 state->sd_X[home_pos_at+i]);
4812 if (bCGP)
4814 for (i = 0; i < nrcg; i++)
4816 copy_rvec(comm->vbuf.v[buf_pos++],
4817 state->cg_p[home_pos_at+i]);
4820 home_pos_cg += 1;
4821 home_pos_at += nrcg;
4823 else
4825 /* Reallocate the buffers if necessary */
4826 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4828 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4829 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4831 nvr = ncg[mc] + nat[mc]*nvec;
4832 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4834 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4835 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4837 /* Copy from the receive to the send buffers */
4838 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4839 comm->buf_int + cg*DD_CGIBS,
4840 DD_CGIBS*sizeof(int));
4841 memcpy(comm->cgcm_state[mc][nvr],
4842 comm->vbuf.v[buf_pos],
4843 (1+nrcg*nvec)*sizeof(rvec));
4844 buf_pos += 1 + nrcg*nvec;
4845 ncg[mc] += 1;
4846 nat[mc] += nrcg;
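/* Note on the buffer layout used in the memcpy above: each charge group
 * occupies 1 + nrcg*nvec consecutive rvecs in the vector buffer: the
 * charge group center first, then the nrcg x coordinates, then, when
 * communicated, the nrcg entries of v, sd_X and cg_p, in that order.
 */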
4851 /* With sorting (!bCompact) the indices are now only partially up to date
4852 * and ncg_home and nat_home are not the real count, since there are
4853 * "holes" in the arrays for the charge groups that moved to neighbors.
4855 if (fr->cutoff_scheme == ecutsVERLET)
4857 moved = get_moved(comm, home_pos_cg);
4859 for (i = dd->ncg_home; i < home_pos_cg; i++)
4861 moved[i] = 0;
4864 dd->ncg_home = home_pos_cg;
4865 dd->nat_home = home_pos_at;
4867 if (debug)
4869 fprintf(debug,
4870 "Finished repartitioning: cgs moved out %d, new home %d\n",
4871 *ncg_moved, dd->ncg_home-*ncg_moved);
4876 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4878 /* Note that the cycles value can be incorrect, either 0 or some
4879 * extremely large value, when our thread migrated to another core
4880 * with an unsynchronized cycle counter. If this happens less often
4881 * than once per nstlist steps, this will not cause issues, since
4882 * we later subtract the maximum value from the sum over nstlist steps.
4883 * A zero count will slightly lower the total, but that's a small effect.
4884 * Note that the main purpose of the subtraction of the maximum value
4885 * is to avoid throwing off the load balancing when stalls occur due to
4886 * e.g. system activity or network congestion.
4888 dd->comm->cycl[ddCycl] += cycles;
4889 dd->comm->cycl_n[ddCycl]++;
4890 if (cycles > dd->comm->cycl_max[ddCycl])
4892 dd->comm->cycl_max[ddCycl] = cycles;
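/* Illustrative sketch (not part of the original source): why subtracting
 * the per-interval maximum makes a cycle sum robust against one outlier,
 * e.g. a stall or an unsynchronized counter after thread migration.
 * The helper name and the plain-array interface are hypothetical.
 */
static float robust_cycle_sum(const float *cycles, int n)
{
    float sum = 0, cmax = 0;
    int   i;

    for (i = 0; i < n; i++)
    {
        sum += cycles[i];
        if (cycles[i] > cmax)
        {
            cmax = cycles[i];
        }
    }
    /* Dropping the single largest sample removes one spike completely;
     * a zero sample only lowers the total slightly, as noted above.
     */
    return sum - cmax;
}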
4896 static double force_flop_count(t_nrnb *nrnb)
4898 int i;
4899 double sum;
4900 const char *name;
4902 sum = 0;
4903 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4905 /* To get closer to the real timings, we halve the count
4906 * for the normal loops and halve it again for the water loops.
4908 name = nrnb_str(i);
4909 if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4911 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4913 else
4915 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4918 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4920 name = nrnb_str(i);
4921 if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4923 sum += nrnb->n[i]*cost_nrnb(i);
4926 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4928 sum += nrnb->n[i]*cost_nrnb(i);
4931 return sum;
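/* Worked example of the weighting above: a normal kernel with a raw
 * count of 1000 contributes 0.50*1000*cost, while a W3/W4 water kernel
 * with the same raw count contributes only 0.25*1000*cost.
 */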
4934 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4936 if (dd->comm->eFlop)
4938 dd->comm->flop -= force_flop_count(nrnb);
4941 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4943 if (dd->comm->eFlop)
4945 dd->comm->flop += force_flop_count(nrnb);
4946 dd->comm->flop_n++;
4950 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4952 int i;
4954 for (i = 0; i < ddCyclNr; i++)
4956 dd->comm->cycl[i] = 0;
4957 dd->comm->cycl_n[i] = 0;
4958 dd->comm->cycl_max[i] = 0;
4960 dd->comm->flop = 0;
4961 dd->comm->flop_n = 0;
4964 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4966 gmx_domdec_comm_t *comm;
4967 domdec_load_t *load;
4968 domdec_root_t *root = NULL;
4969 int d, dim, i, pos;
4970 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4971 gmx_bool bSepPME;
4973 if (debug)
4975 fprintf(debug, "get_load_distribution start\n");
4978 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4980 comm = dd->comm;
4982 bSepPME = (dd->pme_nodeid >= 0);
4984 if (dd->ndim == 0 && bSepPME)
4986 /* Without decomposition, but with PME nodes, we need the load */
4987 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4988 comm->load[0].pme = comm->cycl[ddCyclPME];
4991 for (d = dd->ndim-1; d >= 0; d--)
4993 dim = dd->dim[d];
4994 /* Check if we participate in the communication in this dimension */
4995 if (d == dd->ndim-1 ||
4996 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4998 load = &comm->load[d];
4999 if (dlbIsOn(dd->comm))
5001 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5003 pos = 0;
5004 if (d == dd->ndim-1)
5006 sbuf[pos++] = dd_force_load(comm);
5007 sbuf[pos++] = sbuf[0];
5008 if (dlbIsOn(dd->comm))
5010 sbuf[pos++] = sbuf[0];
5011 sbuf[pos++] = cell_frac;
5012 if (d > 0)
5014 sbuf[pos++] = comm->cell_f_max0[d];
5015 sbuf[pos++] = comm->cell_f_min1[d];
5018 if (bSepPME)
5020 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5021 sbuf[pos++] = comm->cycl[ddCyclPME];
5024 else
5026 sbuf[pos++] = comm->load[d+1].sum;
5027 sbuf[pos++] = comm->load[d+1].max;
5028 if (dlbIsOn(dd->comm))
5030 sbuf[pos++] = comm->load[d+1].sum_m;
5031 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5032 sbuf[pos++] = comm->load[d+1].flags;
5033 if (d > 0)
5035 sbuf[pos++] = comm->cell_f_max0[d];
5036 sbuf[pos++] = comm->cell_f_min1[d];
5039 if (bSepPME)
5041 sbuf[pos++] = comm->load[d+1].mdf;
5042 sbuf[pos++] = comm->load[d+1].pme;
5045 load->nload = pos;
5046 /* Communicate a row in DD direction d.
5047 * The communicators are set up such that the root always has rank 0.
5049 #ifdef GMX_MPI
5050 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5051 load->load, load->nload*sizeof(float), MPI_BYTE,
5052 0, comm->mpi_comm_load[d]);
5053 #endif
5054 if (dd->ci[dim] == dd->master_ci[dim])
5056 /* We are the root, process this row */
5057 if (dlbIsOn(comm))
5059 root = comm->root[d];
5061 load->sum = 0;
5062 load->max = 0;
5063 load->sum_m = 0;
5064 load->cvol_min = 1;
5065 load->flags = 0;
5066 load->mdf = 0;
5067 load->pme = 0;
5068 pos = 0;
5069 for (i = 0; i < dd->nc[dim]; i++)
5071 load->sum += load->load[pos++];
5072 load->max = std::max(load->max, load->load[pos]);
5073 pos++;
5074 if (dlbIsOn(dd->comm))
5076 if (root->bLimited)
5078 /* This direction could not be load balanced properly,
5079 * therefore we need to use the maximum instead of the average load.
5081 load->sum_m = std::max(load->sum_m, load->load[pos]);
5083 else
5085 load->sum_m += load->load[pos];
5087 pos++;
5088 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5089 pos++;
5090 if (d < dd->ndim-1)
5092 load->flags = (int)(load->load[pos++] + 0.5);
5094 if (d > 0)
5096 root->cell_f_max0[i] = load->load[pos++];
5097 root->cell_f_min1[i] = load->load[pos++];
5100 if (bSepPME)
5102 load->mdf = std::max(load->mdf, load->load[pos]);
5103 pos++;
5104 load->pme = std::max(load->pme, load->load[pos]);
5105 pos++;
5108 if (dlbIsOn(comm) && root->bLimited)
5110 load->sum_m *= dd->nc[dim];
5111 load->flags |= (1<<d);
5117 if (DDMASTER(dd))
5119 comm->nload += dd_load_count(comm);
5120 comm->load_step += comm->cycl[ddCyclStep];
5121 comm->load_sum += comm->load[0].sum;
5122 comm->load_max += comm->load[0].max;
5123 if (dlbIsOn(comm))
5125 for (d = 0; d < dd->ndim; d++)
5127 if (comm->load[0].flags & (1<<d))
5129 comm->load_lim[d]++;
5133 if (bSepPME)
5135 comm->load_mdf += comm->load[0].mdf;
5136 comm->load_pme += comm->load[0].pme;
5140 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5142 if (debug)
5144 fprintf(debug, "get_load_distribution finished\n");
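#ifdef GMX_MPI
/* Illustrative sketch (not part of the original source): the row-gather
 * idiom used above, simplified to one float per rank for clarity (the
 * code above packs several values per rank). Rank 0 of the row
 * communicator receives the values in rank order; on non-root ranks the
 * receive buffer is unused and may be NULL. The helper name is
 * hypothetical.
 */
static void gather_row_loads(float myload, float *all_loads, MPI_Comm row_comm)
{
    MPI_Gather(&myload, sizeof(float), MPI_BYTE,
               all_loads, sizeof(float), MPI_BYTE,
               0, row_comm);
}
#endif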
5148 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5150 /* Return the relative performance loss on the total run time
5151 * due to the force calculation load imbalance.
5153 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5155 return
5156 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5157 (dd->comm->load_step*dd->nnodes);
5159 else
5161 return 0;
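/* Worked example (hypothetical numbers): with 4 ranks, an accumulated
 * load_max of 120, load_sum of 400 and load_step of 200, the loss is
 * (120*4 - 400)/(200*4) = 80/800 = 10% of the total run time.
 */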
5165 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5167 char buf[STRLEN];
5168 int npp, npme, nnodes, d, limp;
5169 float imbal, pme_f_ratio, lossf = 0, lossp = 0;
5170 gmx_bool bLim;
5171 gmx_domdec_comm_t *comm;
5173 comm = dd->comm;
5174 if (DDMASTER(dd) && comm->nload > 0)
5176 npp = dd->nnodes;
5177 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5178 nnodes = npp + npme;
5179 if (dd->nnodes > 1 && comm->load_sum > 0)
5181 imbal = comm->load_max*npp/comm->load_sum - 1;
5182 lossf = dd_force_imb_perf_loss(dd);
5183 sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5184 fprintf(fplog, "%s", buf);
5185 fprintf(stderr, "\n");
5186 fprintf(stderr, "%s", buf);
5187 sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5188 fprintf(fplog, "%s", buf);
5189 fprintf(stderr, "%s", buf);
5191 bLim = FALSE;
5192 if (dlbIsOn(comm))
5194 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5195 for (d = 0; d < dd->ndim; d++)
5197 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5198 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5199 if (limp >= 50)
5201 bLim = TRUE;
5204 sprintf(buf+strlen(buf), "\n");
5205 fprintf(fplog, "%s", buf);
5206 fprintf(stderr, "%s", buf);
5208 if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5210 pme_f_ratio = comm->load_pme/comm->load_mdf;
5211 lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
5212 if (lossp <= 0)
5214 lossp *= (float)npme/(float)nnodes;
5216 else
5218 lossp *= (float)npp/(float)nnodes;
5220 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5221 fprintf(fplog, "%s", buf);
5222 fprintf(stderr, "%s", buf);
5223 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5224 fprintf(fplog, "%s", buf);
5225 fprintf(stderr, "%s", buf);
5227 fprintf(fplog, "\n");
5228 fprintf(stderr, "\n");
5230 if (lossf >= DD_PERF_LOSS_WARN)
5232 sprintf(buf,
5233 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5234 " in the domain decomposition.\n", lossf*100);
5235 if (!dlbIsOn(comm))
5237 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
5239 else if (bLim)
5241 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5243 fprintf(fplog, "%s\n", buf);
5244 fprintf(stderr, "%s\n", buf);
5246 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5248 sprintf(buf,
5249 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5250 " had %s work to do than the PP ranks.\n"
5251 " You might want to %s the number of PME ranks\n"
5252 " or %s the cut-off and the grid spacing.\n",
5253 fabs(lossp*100),
5254 (lossp < 0) ? "less" : "more",
5255 (lossp < 0) ? "decrease" : "increase",
5256 (lossp < 0) ? "decrease" : "increase");
5257 fprintf(fplog, "%s\n", buf);
5258 fprintf(stderr, "%s\n", buf);
5263 static float dd_vol_min(gmx_domdec_t *dd)
5265 return dd->comm->load[0].cvol_min*dd->nnodes;
5268 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5270 return dd->comm->load[0].flags;
5273 static float dd_f_imbal(gmx_domdec_t *dd)
5275 if (dd->comm->load[0].sum > 0)
5277 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5279 else
5281 /* Something is wrong in the cycle counting, report no load imbalance */
5282 return 0.0f;
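/* Worked example (hypothetical numbers): with 4 ranks, max = 1.2 and
 * sum = 4.0 give an imbalance of 1.2*4/4.0 - 1 = 0.2, printed as 20%.
 */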
5286 float dd_pme_f_ratio(gmx_domdec_t *dd)
5288 /* Should only be called on the DD master rank */
5289 assert(DDMASTER(dd));
5291 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5293 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5295 else
5297 return -1.0;
5301 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5303 int flags, d;
5304 char buf[22];
5306 flags = dd_load_flags(dd);
5307 if (flags)
5309 fprintf(fplog,
5310 "DD load balancing is limited by minimum cell size in dimension");
5311 for (d = 0; d < dd->ndim; d++)
5313 if (flags & (1<<d))
5315 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5318 fprintf(fplog, "\n");
5320 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5321 if (dlbIsOn(dd->comm))
5323 fprintf(fplog, " vol min/aver %5.3f%c",
5324 dd_vol_min(dd), flags ? '!' : ' ');
5326 if (dd->nnodes > 1)
5328 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5330 if (dd->comm->cycl_n[ddCyclPME])
5332 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5334 fprintf(fplog, "\n\n");
5337 static void dd_print_load_verbose(gmx_domdec_t *dd)
5339 if (dlbIsOn(dd->comm))
5341 fprintf(stderr, "vol %4.2f%c ",
5342 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5344 if (dd->nnodes > 1)
5346 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5348 if (dd->comm->cycl_n[ddCyclPME])
5350 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5354 #ifdef GMX_MPI
5355 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5357 MPI_Comm c_row;
5358 int dim, i, rank;
5359 ivec loc_c;
5360 domdec_root_t *root;
5361 gmx_bool bPartOfGroup = FALSE;
5363 dim = dd->dim[dim_ind];
5364 copy_ivec(loc, loc_c);
5365 for (i = 0; i < dd->nc[dim]; i++)
5367 loc_c[dim] = i;
5368 rank = dd_index(dd->nc, loc_c);
5369 if (rank == dd->rank)
5371 /* This process is part of the group */
5372 bPartOfGroup = TRUE;
5375 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5376 &c_row);
5377 if (bPartOfGroup)
5379 dd->comm->mpi_comm_load[dim_ind] = c_row;
5380 if (dd->comm->dlbState != edlbsOffForever)
5382 if (dd->ci[dim] == dd->master_ci[dim])
5384 /* This is the root process of this row */
5385 snew(dd->comm->root[dim_ind], 1);
5386 root = dd->comm->root[dim_ind];
5387 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5388 snew(root->old_cell_f, dd->nc[dim]+1);
5389 snew(root->bCellMin, dd->nc[dim]);
5390 if (dim_ind > 0)
5392 snew(root->cell_f_max0, dd->nc[dim]);
5393 snew(root->cell_f_min1, dd->nc[dim]);
5394 snew(root->bound_min, dd->nc[dim]);
5395 snew(root->bound_max, dd->nc[dim]);
5397 snew(root->buf_ncd, dd->nc[dim]);
5399 else
5401 /* This is not a root process, we only need to receive cell_f */
5402 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5405 if (dd->ci[dim] == dd->master_ci[dim])
5407 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5411 #endif
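#ifdef GMX_MPI
/* Illustrative sketch (not part of the original source): the split idiom
 * used in make_load_communicator above. Ranks with bMember set get a new
 * communicator ordered by key; ranks passing MPI_UNDEFINED as the color
 * get MPI_COMM_NULL and are left out. The helper name is hypothetical.
 */
static MPI_Comm split_if_member(MPI_Comm parent, gmx_bool bMember, int key)
{
    MPI_Comm sub;

    MPI_Comm_split(parent, bMember ? 0 : MPI_UNDEFINED, key, &sub);

    return sub; /* MPI_COMM_NULL on the ranks that are not members */
}
#endif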
5413 void dd_setup_dlb_resource_sharing(t_commrec gmx_unused *cr,
5414 const gmx_hw_info_t gmx_unused *hwinfo,
5415 const gmx_hw_opt_t gmx_unused *hw_opt)
5417 #ifdef GMX_MPI
5418 int physicalnode_id_hash;
5419 int gpu_id;
5420 gmx_domdec_t *dd;
5421 MPI_Comm mpi_comm_pp_physicalnode;
5423 if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5425 /* Only PP nodes (currently) use GPUs.
5426 * If we don't have GPUs, there are no resources to share.
5428 return;
5431 physicalnode_id_hash = gmx_physicalnode_id_hash();
5433 gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5435 dd = cr->dd;
5437 if (debug)
5439 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5440 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5441 dd->rank, physicalnode_id_hash, gpu_id);
5443 /* Split the PP communicator over the physical nodes */
5444 /* TODO: See if we should store this (before), as it's also used
5445 * for the nodecomm summation.
5447 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5448 &mpi_comm_pp_physicalnode);
5449 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5450 &dd->comm->mpi_comm_gpu_shared);
5451 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5452 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5454 if (debug)
5456 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5459 /* Note that some ranks could share a GPU, while others don't */
5461 if (dd->comm->nrank_gpu_shared == 1)
5463 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5465 #endif
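/* Worked example of the two-stage split above (hypothetical ranks):
 * with PP ranks 0-3 on one physical node and 4-7 on another, the first
 * MPI_Comm_split groups {0,1,2,3} and {4,5,6,7} by the node hash; if
 * ranks 0,1 then use gpu_id 0 and ranks 2,3 use gpu_id 1, the second
 * split yields the per-GPU communicators {0,1} and {2,3}, whose size
 * ends up in nrank_gpu_shared.
 */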
5468 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5470 #ifdef GMX_MPI
5471 int dim0, dim1, i, j;
5472 ivec loc;
5474 if (debug)
5476 fprintf(debug, "Making load communicators\n");
5479 snew(dd->comm->load, std::max(dd->ndim, 1));
5480 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5482 if (dd->ndim == 0)
5484 return;
5487 clear_ivec(loc);
5488 make_load_communicator(dd, 0, loc);
5489 if (dd->ndim > 1)
5491 dim0 = dd->dim[0];
5492 for (i = 0; i < dd->nc[dim0]; i++)
5494 loc[dim0] = i;
5495 make_load_communicator(dd, 1, loc);
5498 if (dd->ndim > 2)
5500 dim0 = dd->dim[0];
5501 for (i = 0; i < dd->nc[dim0]; i++)
5503 loc[dim0] = i;
5504 dim1 = dd->dim[1];
5505 for (j = 0; j < dd->nc[dim1]; j++)
5507 loc[dim1] = j;
5508 make_load_communicator(dd, 2, loc);
5513 if (debug)
5515 fprintf(debug, "Finished making load communicators\n");
5517 #endif
5520 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5522 int d, dim, i, j, m;
5523 ivec tmp, s;
5524 int nzone, nzonep;
5525 ivec dd_zp[DD_MAXIZONE];
5526 gmx_domdec_zones_t *zones;
5527 gmx_domdec_ns_ranges_t *izone;
5529 for (d = 0; d < dd->ndim; d++)
5531 dim = dd->dim[d];
5532 copy_ivec(dd->ci, tmp);
5533 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5534 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5535 copy_ivec(dd->ci, tmp);
5536 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5537 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5538 if (debug)
5540 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5541 dd->rank, dim,
5542 dd->neighbor[d][0],
5543 dd->neighbor[d][1]);
5547 if (fplog)
5549 fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5550 dd->ndim,
5551 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5552 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5554 switch (dd->ndim)
5556 case 3:
5557 nzone = dd_z3n;
5558 nzonep = dd_zp3n;
5559 for (i = 0; i < nzonep; i++)
5561 copy_ivec(dd_zp3[i], dd_zp[i]);
5563 break;
5564 case 2:
5565 nzone = dd_z2n;
5566 nzonep = dd_zp2n;
5567 for (i = 0; i < nzonep; i++)
5569 copy_ivec(dd_zp2[i], dd_zp[i]);
5571 break;
5572 case 1:
5573 nzone = dd_z1n;
5574 nzonep = dd_zp1n;
5575 for (i = 0; i < nzonep; i++)
5577 copy_ivec(dd_zp1[i], dd_zp[i]);
5579 break;
5580 case 0:
5581 nzone = dd_z0n;
5582 nzonep = dd_zp0n;
5583 for (i = 0; i < nzonep; i++)
5585 copy_ivec(dd_zp0[i], dd_zp[i]);
5587 break;
5588 default:
5589 gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5590 nzone = 0;
5591 nzonep = 0;
5594 zones = &dd->comm->zones;
5596 for (i = 0; i < nzone; i++)
5598 m = 0;
5599 clear_ivec(zones->shift[i]);
5600 for (d = 0; d < dd->ndim; d++)
5602 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5606 zones->n = nzone;
5607 for (i = 0; i < nzone; i++)
5609 for (d = 0; d < DIM; d++)
5611 s[d] = dd->ci[d] - zones->shift[i][d];
5612 if (s[d] < 0)
5614 s[d] += dd->nc[d];
5616 else if (s[d] >= dd->nc[d])
5618 s[d] -= dd->nc[d];
5622 zones->nizone = nzonep;
5623 for (i = 0; i < zones->nizone; i++)
5625 if (dd_zp[i][0] != i)
5627 gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5629 izone = &zones->izone[i];
5630 izone->j0 = dd_zp[i][1];
5631 izone->j1 = dd_zp[i][2];
5632 for (dim = 0; dim < DIM; dim++)
5634 if (dd->nc[dim] == 1)
5636 /* All shifts should be allowed */
5637 izone->shift0[dim] = -1;
5638 izone->shift1[dim] = 1;
5640 else
5642 /* Old implementation, now disabled:
5643 izone->shift0[d] = 0;
5644 izone->shift1[d] = 0;
5645 for(j=izone->j0; j<izone->j1; j++) {
5646 if (dd->shift[j][d] > dd->shift[i][d])
5647 izone->shift0[d] = -1;
5648 if (dd->shift[j][d] < dd->shift[i][d])
5649 izone->shift1[d] = 1;
5650 }
5651 */
5653 int shift_diff;
5655 /* Assume the shifts are not more than 1 cell */
5656 izone->shift0[dim] = 1;
5657 izone->shift1[dim] = -1;
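/* Worked example: for an i-zone with shift (0,0,0) and j-zones with
 * shifts (1,0,0) and (0,0,0), the XX differences in the loop below are
 * 1 and 0, so it ends with shift0[XX] = 0 and shift1[XX] = 1.
 */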
5658 for (j = izone->j0; j < izone->j1; j++)
5660 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5661 if (shift_diff < izone->shift0[dim])
5663 izone->shift0[dim] = shift_diff;
5665 if (shift_diff > izone->shift1[dim])
5667 izone->shift1[dim] = shift_diff;
5674 if (dd->comm->dlbState != edlbsOffForever)
5676 snew(dd->comm->root, dd->ndim);
5679 if (dd->comm->bRecordLoad)
5681 make_load_communicators(dd);
5685 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5687 gmx_domdec_t *dd;
5688 dd = cr->dd;
5690 #ifdef GMX_MPI
5691 gmx_domdec_comm_t *comm;
5692 int rank, *buf;
5693 ivec periods;
5694 MPI_Comm comm_cart;
5696 comm = dd->comm;
5698 if (comm->bCartesianPP)
5700 /* Set up cartesian communication for the particle-particle part */
5701 if (fplog)
5703 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5704 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5707 for (int i = 0; i < DIM; i++)
5709 periods[i] = TRUE;
5711 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5712 &comm_cart);
5713 /* We overwrite the old communicator with the new cartesian one */
5714 cr->mpi_comm_mygroup = comm_cart;
5717 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5718 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5720 if (comm->bCartesianPP_PME)
5722 /* Since we want to use the original cartesian setup for sim,
5723 * and not the one after split, we need to make an index.
5725 snew(comm->ddindex2ddnodeid, dd->nnodes);
5726 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5727 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5728 /* Get the rank of the DD master,
5729 * above we made sure that the master node is a PP node.
5731 if (MASTER(cr))
5733 rank = dd->rank;
5735 else
5737 rank = 0;
5739 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5741 else if (comm->bCartesianPP)
5743 if (cr->npmenodes == 0)
5745 /* The PP communicator is also
5746 * the communicator for this simulation
5748 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5750 cr->nodeid = dd->rank;
5752 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5754 /* We need to make an index to go from the coordinates
5755 * to the nodeid of this simulation.
5757 snew(comm->ddindex2simnodeid, dd->nnodes);
5758 snew(buf, dd->nnodes);
5759 if (cr->duty & DUTY_PP)
5761 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5763 /* Communicate the ddindex to simulation nodeid index */
5764 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5765 cr->mpi_comm_mysim);
5766 sfree(buf);
5768 /* Determine the master coordinates and rank.
5769 * The DD master should be the same node as the master of this sim.
5771 for (int i = 0; i < dd->nnodes; i++)
5773 if (comm->ddindex2simnodeid[i] == 0)
5775 ddindex2xyz(dd->nc, i, dd->master_ci);
5776 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5779 if (debug)
5781 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5784 else
5786 /* No Cartesian communicators */
5787 /* We use the rank in dd->mpi_comm_all as DD index */
5788 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5789 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5790 dd->masterrank = 0;
5791 clear_ivec(dd->master_ci);
5793 #endif
5795 if (fplog)
5797 fprintf(fplog,
5798 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5799 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5801 if (debug)
5803 fprintf(debug,
5804 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5805 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5809 static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
5811 #ifdef GMX_MPI
5812 gmx_domdec_t *dd;
5813 gmx_domdec_comm_t *comm;
5815 dd = cr->dd;
5816 comm = dd->comm;
5818 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5820 int *buf;
5821 snew(comm->ddindex2simnodeid, dd->nnodes);
5822 snew(buf, dd->nnodes);
5823 if (cr->duty & DUTY_PP)
5825 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5827 /* Communicate the ddindex to simulation nodeid index */
5828 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5829 cr->mpi_comm_mysim);
5830 sfree(buf);
5832 #endif
5835 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5836 int ncg, int natoms)
5838 gmx_domdec_master_t *ma;
5839 int i;
5841 snew(ma, 1);
5843 snew(ma->ncg, dd->nnodes);
5844 snew(ma->index, dd->nnodes+1);
5845 snew(ma->cg, ncg);
5846 snew(ma->nat, dd->nnodes);
5847 snew(ma->ibuf, dd->nnodes*2);
5848 snew(ma->cell_x, DIM);
5849 for (i = 0; i < DIM; i++)
5851 snew(ma->cell_x[i], dd->nc[i]+1);
5854 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5856 ma->vbuf = NULL;
5858 else
5860 snew(ma->vbuf, natoms);
5863 return ma;
5866 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
5867 int gmx_unused reorder)
5869 gmx_domdec_t *dd;
5870 gmx_domdec_comm_t *comm;
5871 int i;
5872 gmx_bool bDiv[DIM];
5873 #ifdef GMX_MPI
5874 MPI_Comm comm_cart;
5875 #endif
5877 dd = cr->dd;
5878 comm = dd->comm;
5880 if (comm->bCartesianPP)
5882 for (i = 1; i < DIM; i++)
5884 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5886 if (bDiv[YY] || bDiv[ZZ])
5888 comm->bCartesianPP_PME = TRUE;
5889 /* If we have 2D PME decomposition, which is always in x+y,
5890 * we stack the PME-only nodes in z.
5891 * Otherwise we choose the direction that provides the thinnest slab
5892 * of PME-only nodes as this will have the least effect
5893 * on the PP communication.
5894 * But for the PME communication the opposite might be better.
5896 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5897 !bDiv[YY] ||
5898 dd->nc[YY] > dd->nc[ZZ]))
5900 comm->cartpmedim = ZZ;
5902 else
5904 comm->cartpmedim = YY;
5906 comm->ntot[comm->cartpmedim]
5907 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5909 else if (fplog)
5911 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5912 fprintf(fplog,
5913 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5917 #ifdef GMX_MPI
5918 if (comm->bCartesianPP_PME)
5920 int rank;
5921 ivec periods;
5923 if (fplog)
5925 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5928 for (i = 0; i < DIM; i++)
5930 periods[i] = TRUE;
5932 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5933 &comm_cart);
5934 MPI_Comm_rank(comm_cart, &rank);
5935 if (MASTERNODE(cr) && rank != 0)
5937 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5940 /* With this assignment we lose the link to the original communicator,
5941 * which will usually be MPI_COMM_WORLD, unless we have a multisim.
5943 cr->mpi_comm_mysim = comm_cart;
5944 cr->sim_nodeid = rank;
5946 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5948 if (fplog)
5950 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5951 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5954 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5956 cr->duty = DUTY_PP;
5958 if (cr->npmenodes == 0 ||
5959 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5961 cr->duty = DUTY_PME;
5964 /* Split the sim communicator into PP and PME only nodes */
5965 MPI_Comm_split(cr->mpi_comm_mysim,
5966 cr->duty,
5967 dd_index(comm->ntot, dd->ci),
5968 &cr->mpi_comm_mygroup);
5970 else
5972 switch (dd_node_order)
5974 case ddnoPP_PME:
5975 if (fplog)
5977 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5979 break;
5980 case ddnoINTERLEAVE:
5981 /* Interleave the PP-only and PME-only nodes,
5982 * as on clusters with dual-core machines this will double
5983 * the communication bandwidth of the PME processes
5984 * and thus speed up the PP <-> PME and inter PME communication.
5986 if (fplog)
5988 fprintf(fplog, "Interleaving PP and PME ranks\n");
5990 comm->pmenodes = dd_pmenodes(cr);
5991 break;
5992 case ddnoCARTESIAN:
5993 break;
5994 default:
5995 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
5998 if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6000 cr->duty = DUTY_PME;
6002 else
6004 cr->duty = DUTY_PP;
6007 /* Split the sim communicator into PP and PME only nodes */
6008 MPI_Comm_split(cr->mpi_comm_mysim,
6009 cr->duty,
6010 cr->nodeid,
6011 &cr->mpi_comm_mygroup);
6012 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6014 #endif
6016 if (fplog)
6018 fprintf(fplog, "This rank does only %s work.\n\n",
6019 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
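/* Worked example for the split above (hypothetical counts): with 8 ranks
 * of which cr->npmenodes = 2, the six DUTY_PP ranks end up together in
 * one mpi_comm_mygroup and the two DUTY_PME ranks in another; within
 * each group MPI_Comm_rank then renumbers the ranks from 0.
 */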
6023 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6025 gmx_domdec_t *dd;
6026 gmx_domdec_comm_t *comm;
6027 int CartReorder;
6029 dd = cr->dd;
6030 comm = dd->comm;
6032 copy_ivec(dd->nc, comm->ntot);
6034 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6035 comm->bCartesianPP_PME = FALSE;
6037 /* Reorder the nodes by default. This might change the MPI ranks.
6038 * Real reordering is only supported on very few architectures,
6039 * Blue Gene is one of them.
6041 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6043 if (cr->npmenodes > 0)
6045 /* Split the communicator into a PP and PME part */
6046 split_communicator(fplog, cr, dd_node_order, CartReorder);
6047 if (comm->bCartesianPP_PME)
6049 /* We (possibly) reordered the nodes in split_communicator,
6050 * so it is no longer required in make_pp_communicator.
6052 CartReorder = FALSE;
6055 else
6057 /* All nodes do PP and PME */
6058 #ifdef GMX_MPI
6059 /* We do not require separate communicators */
6060 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6061 #endif
6064 if (cr->duty & DUTY_PP)
6066 /* Copy or make a new PP communicator */
6067 make_pp_communicator(fplog, cr, CartReorder);
6069 else
6071 receive_ddindex2simnodeid(cr);
6074 if (!(cr->duty & DUTY_PME))
6076 /* Set up the communication to our PME node */
6077 dd->pme_nodeid = dd_simnode2pmenode(cr, cr->sim_nodeid);
6078 dd->pme_receive_vir_ener = receive_vir_ener(cr);
6079 if (debug)
6081 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6082 dd->pme_nodeid, dd->pme_receive_vir_ener);
6085 else
6087 dd->pme_nodeid = -1;
6090 if (DDMASTER(dd))
6092 dd->ma = init_gmx_domdec_master_t(dd,
6093 comm->cgs_gl.nr,
6094 comm->cgs_gl.index[comm->cgs_gl.nr]);
6098 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6100 real *slb_frac, tot;
6101 int i, n;
6102 double dbl;
6104 slb_frac = NULL;
6105 if (nc > 1 && size_string != NULL)
6107 if (fplog)
6109 fprintf(fplog, "Using static load balancing for the %s direction\n",
6110 dir);
6112 snew(slb_frac, nc);
6113 tot = 0;
6114 for (i = 0; i < nc; i++)
6116 dbl = 0;
6117 sscanf(size_string, "%20lf%n", &dbl, &n);
6118 if (dbl == 0)
6120 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6122 slb_frac[i] = dbl;
6123 size_string += n;
6124 tot += slb_frac[i];
6126 /* Normalize */
6127 if (fplog)
6129 fprintf(fplog, "Relative cell sizes:");
6131 for (i = 0; i < nc; i++)
6133 slb_frac[i] /= tot;
6134 if (fplog)
6136 fprintf(fplog, " %5.3f", slb_frac[i]);
6139 if (fplog)
6141 fprintf(fplog, "\n");
6145 return slb_frac;
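/* Illustrative sketch (not part of the original source): the
 * parse-and-normalize idiom used in get_slb_frac above. Reads nc
 * whitespace-separated numbers with sscanf("%lf%n") and rescales them to
 * fractions that sum to 1. The helper name is hypothetical and the error
 * handling is reduced to an early return (the real code calls gmx_fatal).
 */
static void parse_and_normalize(const char *s, double *frac, int nc)
{
    double tot = 0;
    int    i, n;

    for (i = 0; i < nc; i++)
    {
        if (sscanf(s, "%20lf%n", &frac[i], &n) < 1)
        {
            return; /* missing or malformed entry */
        }
        s   += n; /* advance past the number just consumed */
        tot += frac[i];
    }
    for (i = 0; i < nc; i++)
    {
        frac[i] /= tot; /* e.g. "1 2 1" becomes 0.25 0.5 0.25 */
    }
}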
6148 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6150 int n, nmol, ftype;
6151 gmx_mtop_ilistloop_t iloop;
6152 t_ilist *il;
6154 n = 0;
6155 iloop = gmx_mtop_ilistloop_init(mtop);
6156 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6158 for (ftype = 0; ftype < F_NRE; ftype++)
6160 if ((interaction_function[ftype].flags & IF_BOND) &&
6161 NRAL(ftype) > 2)
6163 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6168 return n;
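/* Worked example of the count above: a proper dihedral has NRAL = 4, so
 * each entry in the ilist takes 1 + 4 = 5 ints; a molecule type with
 * il[F_PDIHS].nr = 50 thus contributes 10 dihedrals, counted nmol times.
 */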
6171 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6173 char *val;
6174 int nst;
6176 nst = def;
6177 val = getenv(env_var);
6178 if (val)
6180 if (sscanf(val, "%20d", &nst) <= 0)
6182 nst = 1;
6184 if (fplog)
6186 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6187 env_var, val, nst);
6191 return nst;
6194 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6196 if (MASTER(cr))
6198 fprintf(stderr, "\n%s\n", warn_string);
6200 if (fplog)
6202 fprintf(fplog, "\n%s\n", warn_string);
6206 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6207 t_inputrec *ir, FILE *fplog)
6209 if (ir->ePBC == epbcSCREW &&
6210 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6212 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6215 if (ir->ns_type == ensSIMPLE)
6217 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6220 if (ir->nstlist == 0)
6222 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6225 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6227 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6231 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6233 int di, d;
6234 real r;
6236 r = ddbox->box_size[XX];
6237 for (di = 0; di < dd->ndim; di++)
6239 d = dd->dim[di];
6240 /* Check using the initial average cell size */
6241 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6244 return r;
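/* Worked example: an 8 x 8 x 8 nm box with skew_fac 1 and a 4 x 2 x 1
 * grid gives average cell sizes of 2 nm in x and 4 nm in y (z is not
 * decomposed), so 2 nm is returned.
 */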
6247 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6248 const char *dlb_opt, gmx_bool bRecordLoad,
6249 unsigned long Flags, t_inputrec *ir)
6251 int dlbState = -1;
6252 char buf[STRLEN];
6254 switch (dlb_opt[0])
6256 case 'a': dlbState = edlbsOffCanTurnOn; break;
6257 case 'n': dlbState = edlbsOffForever; break;
6258 case 'y': dlbState = edlbsOn; break;
6259 default: gmx_incons("Unknown dlb_opt");
6262 if (Flags & MD_RERUN)
6264 return edlbsOffForever;
6267 if (!EI_DYNAMICS(ir->eI))
6269 if (dlbState == edlbsOn)
6271 sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6272 dd_warning(cr, fplog, buf);
6275 return edlbsOffForever;
6278 if (!bRecordLoad)
6280 dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6281 return edlbsOffForever;
6284 if (Flags & MD_REPRODUCIBLE)
6286 switch (dlbState)
6288 case edlbsOffForever:
6289 break;
6290 case edlbsOffCanTurnOn:
6291 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6292 dlbState = edlbsOffForever;
6293 break;
6294 case edlbsOn:
6295 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6296 break;
6297 default:
6298 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6299 break;
6303 return dlbState;
6306 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6308 int dim;
6310 dd->ndim = 0;
6311 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6313 /* Decomposition order z,y,x */
6314 if (fplog)
6316 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6318 for (dim = DIM-1; dim >= 0; dim--)
6320 if (dd->nc[dim] > 1)
6322 dd->dim[dd->ndim++] = dim;
6326 else
6328 /* Decomposition order x,y,z */
6329 for (dim = 0; dim < DIM; dim++)
6331 if (dd->nc[dim] > 1)
6333 dd->dim[dd->ndim++] = dim;
6339 static gmx_domdec_comm_t *init_dd_comm()
6341 gmx_domdec_comm_t *comm;
6342 int i;
6344 snew(comm, 1);
6345 snew(comm->cggl_flag, DIM*2);
6346 snew(comm->cgcm_state, DIM*2);
6347 for (i = 0; i < DIM*2; i++)
6349 comm->cggl_flag_nalloc[i] = 0;
6350 comm->cgcm_state_nalloc[i] = 0;
6353 comm->nalloc_int = 0;
6354 comm->buf_int = NULL;
6356 vec_rvec_init(&comm->vbuf);
6358 comm->n_load_have = 0;
6359 comm->n_load_collect = 0;
6361 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6363 comm->sum_nat[i] = 0;
6365 comm->ndecomp = 0;
6366 comm->nload = 0;
6367 comm->load_step = 0;
6368 comm->load_sum = 0;
6369 comm->load_max = 0;
6370 clear_ivec(comm->load_lim);
6371 comm->load_mdf = 0;
6372 comm->load_pme = 0;
6374 return comm;
6377 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6378 unsigned long Flags,
6379 ivec nc,
6380 real comm_distance_min, real rconstr,
6381 const char *dlb_opt, real dlb_scale,
6382 const char *sizex, const char *sizey, const char *sizez,
6383 gmx_mtop_t *mtop, t_inputrec *ir,
6384 matrix box, rvec *x,
6385 gmx_ddbox_t *ddbox,
6386 int *npme_x, int *npme_y)
6388 gmx_domdec_t *dd;
6389 gmx_domdec_comm_t *comm;
6390 int recload;
6391 real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6392 gmx_bool bC;
6393 char buf[STRLEN];
6394 const real tenPercentMargin = 1.1;
6396 if (fplog)
6398 fprintf(fplog,
6399 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6402 snew(dd, 1);
6404 dd->comm = init_dd_comm();
6405 comm = dd->comm;
6406 /* Note: comm->cggl_flag and comm->cgcm_state were allocated in init_dd_comm() */
6409 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6410 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6412 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6413 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6414 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6415 recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6416 comm->nstSortCG = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6417 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6418 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6419 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6421 dd->pme_recv_f_alloc = 0;
6422 dd->pme_recv_f_buf = NULL;
6424 if (dd->bSendRecv2 && fplog)
6426 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6428 if (comm->eFlop)
6430 if (fplog)
6432 fprintf(fplog, "Will load balance based on FLOP count\n");
6434 if (comm->eFlop > 1)
6436 srand(1+cr->nodeid);
6438 comm->bRecordLoad = TRUE;
6440 else
6442 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6446 /* Initialize the GPU share count to 0, it might change later */
6447 comm->nrank_gpu_shared = 0;
6449 comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6450 comm->bCheckWhetherToTurnDlbOn = TRUE;
6452 if (fplog)
6454 fprintf(fplog, "Dynamic load balancing: %s\n",
6455 edlbs_names[comm->dlbState]);
6457 comm->bPMELoadBalDLBLimits = FALSE;
6459 if (comm->nstSortCG)
6461 if (fplog)
6463 if (comm->nstSortCG == 1)
6465 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6467 else
6469 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6470 comm->nstSortCG);
6473 snew(comm->sort, 1);
6475 else
6477 if (fplog)
6479 fprintf(fplog, "Will not sort the charge groups\n");
6483 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6485 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6486 mtop->bIntermolecularInteractions);
6487 if (comm->bInterCGBondeds)
6489 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6491 else
6493 comm->bInterCGMultiBody = FALSE;
6496 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6497 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6499 if (ir->rlistlong == 0)
6501 /* Set the cut-off to some very large value,
6502 * so we don't need if statements everywhere in the code.
6503 * We use sqrt, since the cut-off is squared in some places.
6505 comm->cutoff = GMX_CUTOFF_INF;
6507 else
6509 comm->cutoff = ir->rlistlong;
6511 comm->cutoff_mbody = 0;
6513 comm->cellsize_limit = 0;
6514 comm->bBondComm = FALSE;
6516 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6517 * within nstlist steps. Since boundaries are allowed to displace by half
6518 * a cell size, DD cells should be at least the size of the list buffer.
6520 comm->cellsize_limit = std::max(comm->cellsize_limit,
6521 ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
6523 if (comm->bInterCGBondeds)
6525 if (comm_distance_min > 0)
6527 comm->cutoff_mbody = comm_distance_min;
6528 if (Flags & MD_DDBONDCOMM)
6530 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6532 else
6534 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6536 r_bonded_limit = comm->cutoff_mbody;
6538 else if (ir->bPeriodicMols)
6540 /* Can not easily determine the required cut-off */
6541 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6542 comm->cutoff_mbody = comm->cutoff/2;
6543 r_bonded_limit = comm->cutoff_mbody;
6545 else
6547 if (MASTER(cr))
6549 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6550 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6552 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6553 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6555 /* We use an initial margin of 10% for the minimum cell size,
6556 * except when we are just below the non-bonded cut-off.
6558 if (Flags & MD_DDBONDCOMM)
6560 if (std::max(r_2b, r_mb) > comm->cutoff)
6562 r_bonded = std::max(r_2b, r_mb);
6563 r_bonded_limit = tenPercentMargin*r_bonded;
6564 comm->bBondComm = TRUE;
6566 else
6568 r_bonded = r_mb;
6569 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6571 /* We determine cutoff_mbody later */
6573 else
6575 /* No special bonded communication,
6576 * simply increase the DD cut-off.
6578 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6579 comm->cutoff_mbody = r_bonded_limit;
6580 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6583 if (fplog)
6585 fprintf(fplog,
6586 "Minimum cell size due to bonded interactions: %.3f nm\n",
6587 r_bonded_limit);
6589 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6592 if (dd->bInterCGcons && rconstr <= 0)
6594 /* There is a cell size limit due to the constraints (P-LINCS) */
6595 rconstr = constr_r_max(fplog, mtop, ir);
6596 if (fplog)
6598 fprintf(fplog,
6599 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6600 rconstr);
6601 if (rconstr > comm->cellsize_limit)
6603 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6607 else if (rconstr > 0 && fplog)
6609 /* Here we do not check for dd->bInterCGcons,
6610 * because one can also set a cell size limit for virtual sites only
6611 * and at this point we don't know yet if there are intercg v-sites.
6613 fprintf(fplog,
6614 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6615 rconstr);
6617 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6619 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6621 if (nc[XX] > 0)
6623 copy_ivec(nc, dd->nc);
6624 set_dd_dim(fplog, dd);
6625 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6627 if (cr->npmenodes == -1)
6629 cr->npmenodes = 0;
6631 acs = average_cellsize_min(dd, ddbox);
6632 if (acs < comm->cellsize_limit)
6634 if (fplog)
6636 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6638 gmx_fatal_collective(FARGS, cr, NULL,
6639 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6640 acs, comm->cellsize_limit);
6643 else
6645 set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6647 /* We need to choose the optimal DD grid and possibly PME nodes */
6648 limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6649 comm->dlbState != edlbsOffForever, dlb_scale,
6650 comm->cellsize_limit, comm->cutoff,
6651 comm->bInterCGBondeds);
6653 if (dd->nc[XX] == 0)
6655 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6656 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6657 !bC ? "-rdd" : "-rcon",
6658 comm->dlbState != edlbsOffForever ? " or -dds" : "",
6659 bC ? " or your LINCS settings" : "");
6661 gmx_fatal_collective(FARGS, cr, NULL,
6662 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6663 "%s\n"
6664 "Look in the log file for details on the domain decomposition",
6665 cr->nnodes-cr->npmenodes, limit, buf);
6667 set_dd_dim(fplog, dd);
6670 if (fplog)
6672 fprintf(fplog,
6673 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6674 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6677 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6678 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6680 gmx_fatal_collective(FARGS, cr, NULL,
6681 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6682 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6684 if (cr->npmenodes > dd->nnodes)
6686 gmx_fatal_collective(FARGS, cr, NULL,
6687 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6689 if (cr->npmenodes > 0)
6691 comm->npmenodes = cr->npmenodes;
6693 else
6695 comm->npmenodes = dd->nnodes;
6698 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6700 /* The following choices should match those
6701 * in comm_cost_est in domdec_setup.c.
6702 * Note that here the checks have to take into account
6703 * that the decomposition might occur in a different order than xyz
6704 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6705 * in which case they will not match those in comm_cost_est,
6706 * but since that is mainly for testing purposes that's fine.
6708 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6709 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6710 getenv("GMX_PMEONEDD") == NULL)
6712 comm->npmedecompdim = 2;
6713 comm->npmenodes_x = dd->nc[XX];
6714 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6716 else
6718 /* In case nc is 1 in both x and y we could still choose to
6719 * decompose pme in y instead of x, but we use x for simplicity.
6721 comm->npmedecompdim = 1;
6722 if (dd->dim[0] == YY)
6724 comm->npmenodes_x = 1;
6725 comm->npmenodes_y = comm->npmenodes;
6727 else
6729 comm->npmenodes_x = comm->npmenodes;
6730 comm->npmenodes_y = 1;
6733 if (fplog)
6735 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6736 comm->npmenodes_x, comm->npmenodes_y, 1);
6739 else
6741 comm->npmedecompdim = 0;
6742 comm->npmenodes_x = 0;
6743 comm->npmenodes_y = 0;
6746 /* Technically we don't need both of these,
6747 * but it simplifies the code not to have to recalculate them.
6749 *npme_x = comm->npmenodes_x;
6750 *npme_y = comm->npmenodes_y;
6752 snew(comm->slb_frac, DIM);
6753 if (comm->dlbState == edlbsOffForever)
6755 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6756 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6757 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6760 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6762 if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6764 /* Set the bonded communication distance to halfway between
6765 * the minimum and the maximum,
6766 * since the extra communication cost is nearly zero.
6768 acs = average_cellsize_min(dd, ddbox);
6769 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6770 if (comm->dlbState != edlbsOffForever)
6772 /* Check if this does not limit the scaling */
6773 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6775 if (!comm->bBondComm)
6777 /* Without bBondComm do not go beyond the n.b. cut-off */
6778 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6779 if (comm->cellsize_limit >= comm->cutoff)
6781 /* We don't lose a lot of efficiency
6782 * when increasing it to the n.b. cut-off.
6783 * It can even be slightly faster, because we need
6784 * less checks for the communication setup.
6786 comm->cutoff_mbody = comm->cutoff;
6789 /* Check if we did not end up below our original limit */
6790 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6792 if (comm->cutoff_mbody > comm->cellsize_limit)
6794 comm->cellsize_limit = comm->cutoff_mbody;
6797 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6800 if (debug)
6802 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6803 "cellsize limit %f\n",
6804 comm->bBondComm, comm->cellsize_limit);
6807 if (MASTER(cr))
6809 check_dd_restrictions(cr, dd, ir, fplog);
6812 comm->partition_step = INT_MIN;
6813 dd->ddp_count = 0;
6815 clear_dd_cycle_counts(dd);
6817 return dd;
6820 static void set_dlb_limits(gmx_domdec_t *dd)
6823 int d;
6825 for (d = 0; d < dd->ndim; d++)
6827 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6828 dd->comm->cellsize_min[dd->dim[d]] =
6829 dd->comm->cellsize_min_dlb[dd->dim[d]];
6834 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6836 gmx_domdec_t *dd;
6837 gmx_domdec_comm_t *comm;
6838 real cellsize_min;
6839 int d, nc, i;
6840 char buf[STRLEN];
6842 dd = cr->dd;
6843 comm = dd->comm;
6845 if (fplog)
6847 fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6850 cellsize_min = comm->cellsize_min[dd->dim[0]];
6851 for (d = 1; d < dd->ndim; d++)
6853 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6856 if (cellsize_min < comm->cellsize_limit*1.05)
6858 dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6860 /* Change DLB from "auto" to "no". */
6861 comm->dlbState = edlbsOffForever;
6863 return;
6866 dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6867 comm->dlbState = edlbsOn;
6869 set_dlb_limits(dd);
6871 /* We can set the required cell size info here,
6872 * so we do not need to communicate this.
6873 * The grid is completely uniform.
6875 for (d = 0; d < dd->ndim; d++)
6877 if (comm->root[d])
6879 comm->load[d].sum_m = comm->load[d].sum;
6881 nc = dd->nc[dd->dim[d]];
6882 for (i = 0; i < nc; i++)
6884 comm->root[d]->cell_f[i] = i/(real)nc;
6885 if (d > 0)
6887 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6888 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6891 comm->root[d]->cell_f[nc] = 1.0;
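/* Worked example: with nc = 4 cells in a dimension the uniform
 * boundaries set above are cell_f = {0, 0.25, 0.5, 0.75, 1.0}, i.e.
 * cell i spans the box fraction [i/nc, (i+1)/nc].
 */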
6896 static char *init_bLocalCG(gmx_mtop_t *mtop)
6898 int ncg, cg;
6899 char *bLocalCG;
6901 ncg = ncg_mtop(mtop);
6902 snew(bLocalCG, ncg);
6903 for (cg = 0; cg < ncg; cg++)
6905 bLocalCG[cg] = FALSE;
6908 return bLocalCG;
6911 void dd_init_bondeds(FILE *fplog,
6912 gmx_domdec_t *dd, gmx_mtop_t *mtop,
6913 gmx_vsite_t *vsite,
6914 t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6916 gmx_domdec_comm_t *comm;
6918 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6920 comm = dd->comm;
6922 if (comm->bBondComm)
6924 /* Communicate atoms beyond the cut-off for bonded interactions */
6925 comm = dd->comm;
6927 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6929 comm->bLocalCG = init_bLocalCG(mtop);
6931 else
6933 /* Only communicate atoms based on cut-off */
6934 comm->cglink = NULL;
6935 comm->bLocalCG = NULL;
6939 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6940 t_inputrec *ir,
6941 gmx_bool bDynLoadBal, real dlb_scale,
6942 gmx_ddbox_t *ddbox)
6944 gmx_domdec_comm_t *comm;
6945 int d;
6946 ivec np;
6947 real limit, shrink;
6948 char buf[64];
6950 if (fplog == NULL)
6952 return;
6955 comm = dd->comm;
6957 if (bDynLoadBal)
6959 fprintf(fplog, "The maximum number of communication pulses is:");
6960 for (d = 0; d < dd->ndim; d++)
6962 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6964 fprintf(fplog, "\n");
6965 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6966 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6967 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6968 for (d = 0; d < DIM; d++)
6970 if (dd->nc[d] > 1)
6972 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6974 shrink = 0;
6976 else
6978 shrink =
6979 comm->cellsize_min_dlb[d]/
6980 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6982 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6985 fprintf(fplog, "\n");
6987 else
6989 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6990 fprintf(fplog, "The initial number of communication pulses is:");
6991 for (d = 0; d < dd->ndim; d++)
6993 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6995 fprintf(fplog, "\n");
6996 fprintf(fplog, "The initial domain decomposition cell size is:");
6997 for (d = 0; d < DIM; d++)
6999 if (dd->nc[d] > 1)
7001 fprintf(fplog, " %c %.2f nm",
7002 dim2char(d), dd->comm->cellsize_min[d]);
7005 fprintf(fplog, "\n\n");
7008 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7010 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7011 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7012 "non-bonded interactions", "", comm->cutoff);
7014 if (bDynLoadBal)
7016 limit = dd->comm->cellsize_limit;
7018 else
7020 if (dynamic_dd_box(ddbox, ir))
7022 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7024 limit = dd->comm->cellsize_min[XX];
7025 for (d = 1; d < DIM; d++)
7027 limit = std::min(limit, dd->comm->cellsize_min[d]);
7031 if (comm->bInterCGBondeds)
7033 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7034 "two-body bonded interactions", "(-rdd)",
7035 std::max(comm->cutoff, comm->cutoff_mbody));
7036 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7037 "multi-body bonded interactions", "(-rdd)",
7038 (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7040 if (dd->vsite_comm)
7042 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7043 "virtual site constructions", "(-rcon)", limit);
7045 if (dd->constraint_comm)
7047 sprintf(buf, "atoms separated by up to %d constraints",
7048 1+ir->nProjOrder);
7049 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7050 buf, "(-rcon)", limit);
7052 fprintf(fplog, "\n");
7055 fflush(fplog);
7058 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7059 real dlb_scale,
7060 const t_inputrec *ir,
7061 const gmx_ddbox_t *ddbox)
7063 gmx_domdec_comm_t *comm;
7064 int d, dim, npulse, npulse_d_max, npulse_d;
7065 gmx_bool bNoCutOff;
7067 comm = dd->comm;
7069 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7071 /* Determine the maximum number of comm. pulses in one dimension */
7073 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7075 /* Determine the maximum required number of grid pulses */
7076 if (comm->cellsize_limit >= comm->cutoff)
7078 /* Only a single pulse is required */
7079 npulse = 1;
7081 else if (!bNoCutOff && comm->cellsize_limit > 0)
7083 /* We round down slightly here to avoid overhead due to the latency
7084 * of extra communication calls when the cut-off
7085 * would be only slightly longer than the cell size.
7086 * Later cellsize_limit is redetermined,
7087 * so we can not miss interactions due to this rounding.
7089 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
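/* Worked example of the slack above: for cutoff = 1.01 and
 * cellsize_limit = 0.5 the exact ratio is 2.02, so a plain ceiling would
 * give 3 pulses, while (int)(0.96 + 2.02) = 2; this is safe because
 * cellsize_limit is enlarged below to at least cutoff/maxpulse when
 * there is a cut-off.
 */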
7091 else
7093 /* There is no cell size limit */
7094 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7097 if (!bNoCutOff && npulse > 1)
7099 /* See if we can do with less pulses, based on dlb_scale */
7100 npulse_d_max = 0;
7101 for (d = 0; d < dd->ndim; d++)
7103 dim = dd->dim[d];
7104 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7105 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7106 npulse_d_max = std::max(npulse_d_max, npulse_d);
7108 npulse = std::min(npulse, npulse_d_max);
7111 /* This env var can override npulse */
7112 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7113 if (d > 0)
7115 npulse = d;
7118 comm->maxpulse = 1;
7119 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7120 for (d = 0; d < dd->ndim; d++)
7122 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7123 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7124 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7125 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7126 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7128 comm->bVacDLBNoLimit = FALSE;
7132 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7133 if (!comm->bVacDLBNoLimit)
7135 comm->cellsize_limit = std::max(comm->cellsize_limit,
7136 comm->cutoff/comm->maxpulse);
7138 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7139 /* Set the minimum cell size for each DD dimension */
7140 for (d = 0; d < dd->ndim; d++)
7142 if (comm->bVacDLBNoLimit ||
7143 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7145 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7147 else
7149 comm->cellsize_min_dlb[dd->dim[d]] =
7150 comm->cutoff/comm->cd[d].np_dlb;
7153 if (comm->cutoff_mbody <= 0)
7155 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7157 if (dlbIsOn(comm))
7159 set_dlb_limits(dd);
7163 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7165 /* If each molecule is a single charge group
7166 * or we use domain decomposition for each periodic dimension,
7167 * we do not need to take pbc into account for the bonded interactions.
7169 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7170 !(dd->nc[XX] > 1 &&
7171 dd->nc[YY] > 1 &&
7172 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7175 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7176 t_inputrec *ir, gmx_ddbox_t *ddbox)
7178 gmx_domdec_comm_t *comm;
7179 int natoms_tot;
7180 real vol_frac;
7182 comm = dd->comm;
7184 /* Initialize the thread data.
7185 * This can not be done in init_domain_decomposition,
7186 * as the number of threads is determined later.
7188 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7189 if (comm->nth > 1)
7191 snew(comm->dth, comm->nth);
7194 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7196 init_ddpme(dd, &comm->ddpme[0], 0);
7197 if (comm->npmedecompdim >= 2)
7199 init_ddpme(dd, &comm->ddpme[1], 1);
7202 else
7204 comm->npmenodes = 0;
7205 if (dd->pme_nodeid >= 0)
7207 gmx_fatal_collective(FARGS, NULL, dd,
7208 "Can not have separate PME ranks without PME electrostatics");
7212 if (debug)
7214 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7216 if (comm->dlbState != edlbsOffForever)
7218 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7221 print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox);
7222 if (comm->dlbState == edlbsOffCanTurnOn)
7224 if (fplog)
7226 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7228 print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7231 if (ir->ePBC == epbcNONE)
7233 vol_frac = 1 - 1/(double)dd->nnodes;
7235 else
7237 vol_frac =
7238 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7240 if (debug)
7242 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7244 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7246 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
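/* The global-to-local lookup is thus sized for natoms_tot global
 * atoms, with vol_frac*natoms_tot as the expected number of entries
 * (home plus communicated atoms) per rank.
 */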
7249 static gmx_bool test_dd_cutoff(t_commrec *cr,
7250 t_state *state, t_inputrec *ir,
7251 real cutoff_req)
7253 gmx_domdec_t *dd;
7254 gmx_ddbox_t ddbox;
7255 int d, dim, np;
7256 real inv_cell_size;
7257 int LocallyLimited;
7259 dd = cr->dd;
7261 set_ddbox(dd, FALSE, cr, ir, state->box,
7262 TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7264 LocallyLimited = 0;
7266 for (d = 0; d < dd->ndim; d++)
7268 dim = dd->dim[d];
7270 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7271 if (dynamic_dd_box(&ddbox, ir))
7273 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7276 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7278 if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7279 dd->comm->cd[d].np_dlb > 0)
7281 if (np > dd->comm->cd[d].np_dlb)
7283 return FALSE;
7286 /* If a current local cell size is smaller than the requested
7287 * cut-off, we could still fix it, but this gets very complicated.
7288 * Without fixing here, we might actually need more checks.
7290 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7292 LocallyLimited = 1;
7297 if (dd->comm->dlbState != edlbsOffForever)
7299 /* If DLB is not active yet, we don't need to check the grid jumps.
7300 * Actually we shouldn't, because then the grid jump data is not set.
7302 if (dlbIsOn(dd->comm) &&
7303 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7305 LocallyLimited = 1;
7308 gmx_sumi(1, &LocallyLimited, cr);
7310 if (LocallyLimited > 0)
7312 return FALSE;
7316 return TRUE;
7319 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7320 real cutoff_req)
7322 gmx_bool bCutoffAllowed;
7324 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7326 if (bCutoffAllowed)
7328 cr->dd->comm->cutoff = cutoff_req;
7331 return bCutoffAllowed;
7334 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7336 gmx_domdec_comm_t *comm;
7338 comm = cr->dd->comm;
7340 /* Turn on the DLB limiting (might have been on already) */
7341 comm->bPMELoadBalDLBLimits = TRUE;
7343 /* Change the cut-off limit */
7344 comm->PMELoadBal_max_cutoff = cutoff;
7346 if (debug)
7348 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7349 comm->PMELoadBal_max_cutoff);
7353 /* Sets whether we should later check the load imbalance data, so that
7354 * we can trigger dynamic load balancing if enough imbalance has
7355 * arisen.
7357 * Used after PME load balancing unlocks DLB, so that the check
7358 * whether DLB will be useful can happen immediately.
7360 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7362 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7364 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7368 /* Returns if we should check whether there has been enough load
7369 * imbalance to trigger dynamic load balancing.
7371 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7373 const int nddp_chk_dlb = 100;
7375 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7377 return FALSE;
7380 /* We should check whether we should use DLB directly after
7381 * unlocking DLB. */
7382 if (dd->comm->bCheckWhetherToTurnDlbOn)
7384 /* This flag was set when the PME load-balancing routines
7385 unlocked DLB, and should now be cleared. */
7386 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7387 return TRUE;
7389 /* We should also check whether we should use DLB every 100
7390 * partitionings (we do not do this every partitioning, so that we
7391 * avoid excessive communication). */
7392 if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
7394 return TRUE;
7397 return FALSE;
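/* Cadence of the check above: with nddp_chk_dlb = 100 it fires when
 * n_load_have % 100 == 99, i.e. roughly every 100th recorded load,
 * plus once immediately after PME load balancing unlocks DLB via the
 * bCheckWhetherToTurnDlbOn flag.
 */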
7400 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7402 return (dd->comm->dlbState == edlbsOn);
7405 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7407 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7410 void dd_dlb_lock(gmx_domdec_t *dd)
7412 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7413 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7415 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7419 void dd_dlb_unlock(gmx_domdec_t *dd)
7421 /* We can only unlock DLB when it was temporarily locked, otherwise don't do anything */
7422 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7424 dd->comm->dlbState = edlbsOffCanTurnOn;
7425 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
7429 static void merge_cg_buffers(int ncell,
7430 gmx_domdec_comm_dim_t *cd, int pulse,
7431 int *ncg_cell,
7432 int *index_gl, int *recv_i,
7433 rvec *cg_cm, rvec *recv_vr,
7434 int *cgindex,
7435 cginfo_mb_t *cginfo_mb, int *cginfo)
7437 gmx_domdec_ind_t *ind, *ind_p;
7438 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7439 int shift, shift_at;
7441 ind = &cd->ind[pulse];
7443 /* First correct the already stored data */
7444 shift = ind->nrecv[ncell];
7445 for (cell = ncell-1; cell >= 0; cell--)
7447 shift -= ind->nrecv[cell];
7448 if (shift > 0)
7450 /* Move the cg's already present from previous grid pulses */
7451 cg0 = ncg_cell[ncell+cell];
7452 cg1 = ncg_cell[ncell+cell+1];
7453 cgindex[cg1+shift] = cgindex[cg1];
7454 for (cg = cg1-1; cg >= cg0; cg--)
7456 index_gl[cg+shift] = index_gl[cg];
7457 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7458 cgindex[cg+shift] = cgindex[cg];
7459 cginfo[cg+shift] = cginfo[cg];
7461 /* Correct the already stored send indices for the shift */
7462 for (p = 1; p <= pulse; p++)
7464 ind_p = &cd->ind[p];
7465 cg0 = 0;
7466 for (c = 0; c < cell; c++)
7468 cg0 += ind_p->nsend[c];
7470 cg1 = cg0 + ind_p->nsend[cell];
7471 for (cg = cg0; cg < cg1; cg++)
7473 ind_p->index[cg] += shift;
7479 /* Merge in the communicated buffers */
7480 shift = 0;
7481 shift_at = 0;
7482 cg0 = 0;
7483 for (cell = 0; cell < ncell; cell++)
7485 cg1 = ncg_cell[ncell+cell+1] + shift;
7486 if (shift_at > 0)
7488 /* Correct the old cg indices */
7489 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7491 cgindex[cg+1] += shift_at;
7494 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7496 /* Copy this charge group from the buffer */
7497 index_gl[cg1] = recv_i[cg0];
7498 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7499 /* Add it to the cgindex */
7500 cg_gl = index_gl[cg1];
7501 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7502 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7503 cgindex[cg1+1] = cgindex[cg1] + nat;
7504 cg0++;
7505 cg1++;
7506 shift_at += nat;
7508 shift += ind->nrecv[cell];
7509 ncg_cell[ncell+cell+1] = cg1;
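/* In short, merge_cg_buffers above works in two phases: it first
 * shifts the blocks stored by earlier pulses to make room, correcting
 * the stored send indices by the same shift, and then appends the
 * newly received charge groups per zone, extending cgindex with the
 * atom counts taken from cginfo.
 */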
7513 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7514 int nzone, int cg0, const int *cgindex)
7516 int cg, zone, p;
7518 /* Store the atom block boundaries for easy copying of communication buffers
7520 cg = cg0;
7521 for (zone = 0; zone < nzone; zone++)
7523 for (p = 0; p < cd->np; p++)
7525 cd->ind[p].cell2at0[zone] = cgindex[cg];
7526 cg += cd->ind[p].nrecv[zone];
7527 cd->ind[p].cell2at1[zone] = cgindex[cg];
7532 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7534 int i;
7535 gmx_bool bMiss;
7537 bMiss = FALSE;
7538 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7540 if (!bLocalCG[link->a[i]])
7542 bMiss = TRUE;
7546 return bMiss;
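/* missing_link() returns TRUE when at least one charge group bonded
 * to cg_gl is not present locally, in which case cg_gl still has to
 * be communicated for bBondComm; see the use in get_zone_pulse_cgs()
 * below.
 */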
7549 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7550 typedef struct {
7551 real c[DIM][4]; /* the corners for the non-bonded communication */
7552 real cr0; /* corner for rounding */
7553 real cr1[4]; /* corners for rounding */
7554 real bc[DIM]; /* corners for bonded communication */
7555 real bcr1; /* corner for rounding for bonded communication */
7556 } dd_corners_t;
7558 /* Determine the corners of the domain(s) we are communicating with */
7559 static void
7560 set_dd_corners(const gmx_domdec_t *dd,
7561 int dim0, int dim1, int dim2,
7562 gmx_bool bDistMB,
7563 dd_corners_t *c)
7565 const gmx_domdec_comm_t *comm;
7566 const gmx_domdec_zones_t *zones;
7567 int i, j;
7569 comm = dd->comm;
7571 zones = &comm->zones;
7573 /* Keep the compiler happy */
7574 c->cr0 = 0;
7575 c->bcr1 = 0;
7577 /* The first dimension is equal for all cells */
7578 c->c[0][0] = comm->cell_x0[dim0];
7579 if (bDistMB)
7581 c->bc[0] = c->c[0][0];
7583 if (dd->ndim >= 2)
7585 dim1 = dd->dim[1];
7586 /* This cell row is only seen from the first row */
7587 c->c[1][0] = comm->cell_x0[dim1];
7588 /* All rows can see this row */
7589 c->c[1][1] = comm->cell_x0[dim1];
7590 if (dlbIsOn(dd->comm))
7592 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7593 if (bDistMB)
7595 /* For the multi-body distance we need the maximum */
7596 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7599 /* Set the upper-right corner for rounding */
7600 c->cr0 = comm->cell_x1[dim0];
7602 if (dd->ndim >= 3)
7604 dim2 = dd->dim[2];
7605 for (j = 0; j < 4; j++)
7607 c->c[2][j] = comm->cell_x0[dim2];
7609 if (dlbIsOn(dd->comm))
7611 /* Use the maximum of the i-cells that see a j-cell */
7612 for (i = 0; i < zones->nizone; i++)
7614 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7616 if (j >= 4)
7618 c->c[2][j-4] =
7619 std::max(c->c[2][j-4],
7620 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7624 if (bDistMB)
7626 /* For the multi-body distance we need the maximum */
7627 c->bc[2] = comm->cell_x0[dim2];
7628 for (i = 0; i < 2; i++)
7630 for (j = 0; j < 2; j++)
7632 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7638 /* Set the upper-right corner for rounding */
7639 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7640 * Only cell (0,0,0) can see cell 7 (1,1,1)
7642 c->cr1[0] = comm->cell_x1[dim1];
7643 c->cr1[3] = comm->cell_x1[dim1];
7644 if (dlbIsOn(dd->comm))
7646 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7647 if (bDistMB)
7649 /* For the multi-body distance we need the maximum */
7650 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7657 /* Determine which cg's we need to send in this pulse from this zone */
7658 static void
7659 get_zone_pulse_cgs(gmx_domdec_t *dd,
7660 int zonei, int zone,
7661 int cg0, int cg1,
7662 const int *index_gl,
7663 const int *cgindex,
7664 int dim, int dim_ind,
7665 int dim0, int dim1, int dim2,
7666 real r_comm2, real r_bcomm2,
7667 matrix box,
7668 ivec tric_dist,
7669 rvec *normal,
7670 real skew_fac2_d, real skew_fac_01,
7671 rvec *v_d, rvec *v_0, rvec *v_1,
7672 const dd_corners_t *c,
7673 rvec sf2_round,
7674 gmx_bool bDistBonded,
7675 gmx_bool bBondComm,
7676 gmx_bool bDist2B,
7677 gmx_bool bDistMB,
7678 rvec *cg_cm,
7679 int *cginfo,
7680 gmx_domdec_ind_t *ind,
7681 int **ibuf, int *ibuf_nalloc,
7682 vec_rvec_t *vbuf,
7683 int *nsend_ptr,
7684 int *nat_ptr,
7685 int *nsend_z_ptr)
7687 gmx_domdec_comm_t *comm;
7688 gmx_bool bScrew;
7689 gmx_bool bDistMB_pulse;
7690 int cg, i;
7691 real r2, rb2, r, tric_sh;
7692 rvec rn, rb;
7693 int dimd;
7694 int nsend_z, nsend, nat;
7696 comm = dd->comm;
7698 bScrew = (dd->bScrewPBC && dim == XX);
7700 bDistMB_pulse = (bDistMB && bDistBonded);
7702 nsend_z = 0;
7703 nsend = *nsend_ptr;
7704 nat = *nat_ptr;
7706 for (cg = cg0; cg < cg1; cg++)
7708 r2 = 0;
7709 rb2 = 0;
7710 if (tric_dist[dim_ind] == 0)
7712 /* Rectangular direction, easy */
7713 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7714 if (r > 0)
7716 r2 += r*r;
7718 if (bDistMB_pulse)
7720 r = cg_cm[cg][dim] - c->bc[dim_ind];
7721 if (r > 0)
7723 rb2 += r*r;
7726 /* Rounding gives at most a 16% reduction
7727 * in communicated atoms
7729 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7731 r = cg_cm[cg][dim0] - c->cr0;
7732 /* This is the first dimension, so always r >= 0 */
7733 r2 += r*r;
7734 if (bDistMB_pulse)
7736 rb2 += r*r;
7739 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7741 r = cg_cm[cg][dim1] - c->cr1[zone];
7742 if (r > 0)
7744 r2 += r*r;
7746 if (bDistMB_pulse)
7748 r = cg_cm[cg][dim1] - c->bcr1;
7749 if (r > 0)
7751 rb2 += r*r;
7756 else
7758 /* Triclinic direction, more complicated */
7759 clear_rvec(rn);
7760 clear_rvec(rb);
7761 /* Rounding, conservative as the skew_fac multiplication
7762 * will slightly underestimate the distance.
7764 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7766 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7767 for (i = dim0+1; i < DIM; i++)
7769 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7771 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7772 if (bDistMB_pulse)
7774 rb[dim0] = rn[dim0];
7775 rb2 = r2;
7777 /* Take care that the cell planes along dim0 might not
7778 * be orthogonal to those along dim1 and dim2.
7780 for (i = 1; i <= dim_ind; i++)
7782 dimd = dd->dim[i];
7783 if (normal[dim0][dimd] > 0)
7785 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7786 if (bDistMB_pulse)
7788 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7793 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7795 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7796 tric_sh = 0;
7797 for (i = dim1+1; i < DIM; i++)
7799 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7801 rn[dim1] += tric_sh;
7802 if (rn[dim1] > 0)
7804 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7805 /* Take care of coupling of the distances
7806 * to the planes along dim0 and dim1 through dim2.
7808 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7809 /* Take care that the cell planes along dim1
7810 * might not be orthogonal to that along dim2.
7812 if (normal[dim1][dim2] > 0)
7814 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7817 if (bDistMB_pulse)
7819 rb[dim1] +=
7820 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7821 if (rb[dim1] > 0)
7823 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7824 /* Take care of coupling of the distances
7825 * to the planes along dim0 and dim1 through dim2.
7827 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7828 /* Take care that the cell planes along dim1
7829 * might not be orthogonal to that along dim2.
7831 if (normal[dim1][dim2] > 0)
7833 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7838 /* The distance along the communication direction */
7839 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7840 tric_sh = 0;
7841 for (i = dim+1; i < DIM; i++)
7843 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7845 rn[dim] += tric_sh;
7846 if (rn[dim] > 0)
7848 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7849 /* Take care of coupling of the distances
7850 * to the planes along dim0 and dim1 through dim2.
7852 if (dim_ind == 1 && zonei == 1)
7854 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7857 if (bDistMB_pulse)
7859 clear_rvec(rb);
7860 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7861 if (rb[dim] > 0)
7863 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7864 /* Take care of coupling of the distances
7865 * to the planes along dim0 and dim1 through dim2.
7867 if (dim_ind == 1 && zonei == 1)
7869 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7875 if (r2 < r_comm2 ||
7876 (bDistBonded &&
7877 ((bDistMB && rb2 < r_bcomm2) ||
7878 (bDist2B && r2 < r_bcomm2)) &&
7879 (!bBondComm ||
7880 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7881 missing_link(comm->cglink, index_gl[cg],
7882 comm->bLocalCG)))))
7884 /* Make an index to the local charge groups */
7885 if (nsend+1 > ind->nalloc)
7887 ind->nalloc = over_alloc_large(nsend+1);
7888 srenew(ind->index, ind->nalloc);
7890 if (nsend+1 > *ibuf_nalloc)
7892 *ibuf_nalloc = over_alloc_large(nsend+1);
7893 srenew(*ibuf, *ibuf_nalloc);
7895 ind->index[nsend] = cg;
7896 (*ibuf)[nsend] = index_gl[cg];
7897 nsend_z++;
7898 vec_rvec_check_alloc(vbuf, nsend+1);
7900 if (dd->ci[dim] == 0)
7902 /* Correct cg_cm for pbc */
7903 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7904 if (bScrew)
7906 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7907 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7910 else
7912 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7914 nsend++;
7915 nat += cgindex[cg+1] - cgindex[cg];
7919 *nsend_ptr = nsend;
7920 *nat_ptr = nat;
7921 *nsend_z_ptr = nsend_z;
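/* Summary of the send decision above: a charge group is sent when it
 * lies within the non-bonded cut-off (r2 < r_comm2) of the receiving
 * zone, or, on the bonded-distance pulse, within the bonded cut-off
 * (rb2 or r2 < r_bcomm2) of the multi-body/two-body corners; with
 * bBondComm the bonded branch additionally requires a bonded link to
 * a charge group that is not yet local.
 */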
7924 static void setup_dd_communication(gmx_domdec_t *dd,
7925 matrix box, gmx_ddbox_t *ddbox,
7926 t_forcerec *fr, t_state *state, rvec **f)
7928 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7929 int nzone, nzone_send, zone, zonei, cg0, cg1;
7930 int c, i, cg, cg_gl, nrcg;
7931 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7932 gmx_domdec_comm_t *comm;
7933 gmx_domdec_zones_t *zones;
7934 gmx_domdec_comm_dim_t *cd;
7935 gmx_domdec_ind_t *ind;
7936 cginfo_mb_t *cginfo_mb;
7937 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7938 real r_comm2, r_bcomm2;
7939 dd_corners_t corners;
7940 ivec tric_dist;
7941 rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7942 real skew_fac2_d, skew_fac_01;
7943 rvec sf2_round;
7944 int nsend, nat;
7945 int th;
7947 if (debug)
7949 fprintf(debug, "Setting up DD communication\n");
7952 comm = dd->comm;
7954 switch (fr->cutoff_scheme)
7956 case ecutsGROUP:
7957 cg_cm = fr->cg_cm;
7958 break;
7959 case ecutsVERLET:
7960 cg_cm = state->x;
7961 break;
7962 default:
7963 gmx_incons("unimplemented");
7964 cg_cm = NULL;
7967 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7969 /* Check if we need to use triclinic distances */
7970 tric_dist[dim_ind] = 0;
7971 for (i = 0; i <= dim_ind; i++)
7973 if (ddbox->tric_dir[dd->dim[i]])
7975 tric_dist[dim_ind] = 1;
7980 bBondComm = comm->bBondComm;
7982 /* Do we need to determine extra distances for multi-body bondeds? */
7983 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
7985 /* Do we need to determine extra distances for only two-body bondeds? */
7986 bDist2B = (bBondComm && !bDistMB);
7988 r_comm2 = sqr(comm->cutoff);
7989 r_bcomm2 = sqr(comm->cutoff_mbody);
7991 if (debug)
7993 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
7996 zones = &comm->zones;
7998 dim0 = dd->dim[0];
7999 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8000 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8002 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8004 /* Triclinic stuff */
8005 normal = ddbox->normal;
8006 skew_fac_01 = 0;
8007 if (dd->ndim >= 2)
8009 v_0 = ddbox->v[dim0];
8010 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8012 /* Determine the coupling coefficient for the distances
8013 * to the cell planes along dim0 and dim1 through dim2.
8014 * This is required for correct rounding.
8016 skew_fac_01 =
8017 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8018 if (debug)
8020 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8024 if (dd->ndim >= 3)
8026 v_1 = ddbox->v[dim1];
8029 zone_cg_range = zones->cg_range;
8030 index_gl = dd->index_gl;
8031 cgindex = dd->cgindex;
8032 cginfo_mb = fr->cginfo_mb;
8034 zone_cg_range[0] = 0;
8035 zone_cg_range[1] = dd->ncg_home;
8036 comm->zone_ncg1[0] = dd->ncg_home;
8037 pos_cg = dd->ncg_home;
8039 nat_tot = dd->nat_home;
8040 nzone = 1;
8041 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8043 dim = dd->dim[dim_ind];
8044 cd = &comm->cd[dim_ind];
8046 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8048 /* No pbc in this dimension, the first node should not comm. */
8049 nzone_send = 0;
8051 else
8053 nzone_send = nzone;
8056 v_d = ddbox->v[dim];
8057 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8059 cd->bInPlace = TRUE;
8060 for (p = 0; p < cd->np; p++)
8062 /* Only atoms communicated in the first pulse are used
8063 * for multi-body bonded interactions or for bBondComm.
8065 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8067 ind = &cd->ind[p];
8068 nsend = 0;
8069 nat = 0;
8070 for (zone = 0; zone < nzone_send; zone++)
8072 if (tric_dist[dim_ind] && dim_ind > 0)
8074 /* Determine slightly more optimized skew_fac's
8075 * for rounding.
8076 * This reduces the number of communicated atoms
8077 * by about 10% for 3D DD of rhombic dodecahedra.
8079 for (dimd = 0; dimd < dim; dimd++)
8081 sf2_round[dimd] = 1;
8082 if (ddbox->tric_dir[dimd])
8084 for (i = dd->dim[dimd]+1; i < DIM; i++)
8086 /* If we are shifted in dimension i
8087 * and the cell plane is tilted forward
8088 * in dimension i, skip this coupling.
8090 if (!(zones->shift[nzone+zone][i] &&
8091 ddbox->v[dimd][i][dimd] >= 0))
8093 sf2_round[dimd] +=
8094 sqr(ddbox->v[dimd][i][dimd]);
8097 sf2_round[dimd] = 1/sf2_round[dimd];
8102 zonei = zone_perm[dim_ind][zone];
8103 if (p == 0)
8105 /* Here we permute the zones to obtain a convenient order
8106 * for neighbor searching
8108 cg0 = zone_cg_range[zonei];
8109 cg1 = zone_cg_range[zonei+1];
8111 else
8113 /* Look only at the cg's received in the previous grid pulse
8115 cg1 = zone_cg_range[nzone+zone+1];
8116 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8119 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8120 for (th = 0; th < comm->nth; th++)
8124 gmx_domdec_ind_t *ind_p;
8125 int **ibuf_p, *ibuf_nalloc_p;
8126 vec_rvec_t *vbuf_p;
8127 int *nsend_p, *nat_p;
8128 int *nsend_zone_p;
8129 int cg0_th, cg1_th;
8131 if (th == 0)
8133 /* Thread 0 writes in the comm buffers */
8134 ind_p = ind;
8135 ibuf_p = &comm->buf_int;
8136 ibuf_nalloc_p = &comm->nalloc_int;
8137 vbuf_p = &comm->vbuf;
8138 nsend_p = &nsend;
8139 nat_p = &nat;
8140 nsend_zone_p = &ind->nsend[zone];
8142 else
8144 /* Other threads write into temp buffers */
8145 ind_p = &comm->dth[th].ind;
8146 ibuf_p = &comm->dth[th].ibuf;
8147 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8148 vbuf_p = &comm->dth[th].vbuf;
8149 nsend_p = &comm->dth[th].nsend;
8150 nat_p = &comm->dth[th].nat;
8151 nsend_zone_p = &comm->dth[th].nsend_zone;
8153 comm->dth[th].nsend = 0;
8154 comm->dth[th].nat = 0;
8155 comm->dth[th].nsend_zone = 0;
8158 if (comm->nth == 1)
8160 cg0_th = cg0;
8161 cg1_th = cg1;
8163 else
8165 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8166 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
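/* Static range split over the threads, e.g. cg0 = 0, cg1 = 10 and
 * comm->nth = 4 give the ranges [0,2), [2,5), [5,7), [7,10):
 * contiguous, non-overlapping and together covering [cg0,cg1).
 */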
8169 /* Get the cg's for this pulse in this zone */
8170 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8171 index_gl, cgindex,
8172 dim, dim_ind, dim0, dim1, dim2,
8173 r_comm2, r_bcomm2,
8174 box, tric_dist,
8175 normal, skew_fac2_d, skew_fac_01,
8176 v_d, v_0, v_1, &corners, sf2_round,
8177 bDistBonded, bBondComm,
8178 bDist2B, bDistMB,
8179 cg_cm, fr->cginfo,
8180 ind_p,
8181 ibuf_p, ibuf_nalloc_p,
8182 vbuf_p,
8183 nsend_p, nat_p,
8184 nsend_zone_p);
8186 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8187 } // END
8189 /* Append data of threads>=1 to the communication buffers */
8190 for (th = 1; th < comm->nth; th++)
8192 dd_comm_setup_work_t *dth;
8193 int i, ns1;
8195 dth = &comm->dth[th];
8197 ns1 = nsend + dth->nsend_zone;
8198 if (ns1 > ind->nalloc)
8200 ind->nalloc = over_alloc_dd(ns1);
8201 srenew(ind->index, ind->nalloc);
8203 if (ns1 > comm->nalloc_int)
8205 comm->nalloc_int = over_alloc_dd(ns1);
8206 srenew(comm->buf_int, comm->nalloc_int);
8208 if (ns1 > comm->vbuf.nalloc)
8210 comm->vbuf.nalloc = over_alloc_dd(ns1);
8211 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8214 for (i = 0; i < dth->nsend_zone; i++)
8216 ind->index[nsend] = dth->ind.index[i];
8217 comm->buf_int[nsend] = dth->ibuf[i];
8218 copy_rvec(dth->vbuf.v[i],
8219 comm->vbuf.v[nsend]);
8220 nsend++;
8222 nat += dth->nat;
8223 ind->nsend[zone] += dth->nsend_zone;
8226 /* Clear the counts in case we do not have pbc */
8227 for (zone = nzone_send; zone < nzone; zone++)
8229 ind->nsend[zone] = 0;
8231 ind->nsend[nzone] = nsend;
8232 ind->nsend[nzone+1] = nat;
8233 /* Communicate the number of cg's and atoms to receive */
8234 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8235 ind->nsend, nzone+2,
8236 ind->nrecv, nzone+2);
8238 /* The rvec buffer is also required for atom buffers of size nsend
8239 * in dd_move_x and dd_move_f.
8241 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8243 if (p > 0)
8245 /* We can receive in place if only the last zone is not empty */
8246 for (zone = 0; zone < nzone-1; zone++)
8248 if (ind->nrecv[zone] > 0)
8250 cd->bInPlace = FALSE;
8253 if (!cd->bInPlace)
8255 /* The int buffer is only required here for the cg indices */
8256 if (ind->nrecv[nzone] > comm->nalloc_int2)
8258 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8259 srenew(comm->buf_int2, comm->nalloc_int2);
8261 /* The rvec buffer is also required for atom buffers
8262 * of size nrecv in dd_move_x and dd_move_f.
8264 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8265 vec_rvec_check_alloc(&comm->vbuf2, i);
8269 /* Make space for the global cg indices */
8270 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8271 || dd->cg_nalloc == 0)
8273 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8274 srenew(index_gl, dd->cg_nalloc);
8275 srenew(cgindex, dd->cg_nalloc+1);
8277 /* Communicate the global cg indices */
8278 if (cd->bInPlace)
8280 recv_i = index_gl + pos_cg;
8282 else
8284 recv_i = comm->buf_int2;
8286 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8287 comm->buf_int, nsend,
8288 recv_i, ind->nrecv[nzone]);
8290 /* Make space for cg_cm */
8291 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8292 if (fr->cutoff_scheme == ecutsGROUP)
8294 cg_cm = fr->cg_cm;
8296 else
8298 cg_cm = state->x;
8300 /* Communicate cg_cm */
8301 if (cd->bInPlace)
8303 recv_vr = cg_cm + pos_cg;
8305 else
8307 recv_vr = comm->vbuf2.v;
8309 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8310 comm->vbuf.v, nsend,
8311 recv_vr, ind->nrecv[nzone]);
8313 /* Make the charge group index */
8314 if (cd->bInPlace)
8316 zone = (p == 0 ? 0 : nzone - 1);
8317 while (zone < nzone)
8319 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8321 cg_gl = index_gl[pos_cg];
8322 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8323 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8324 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8325 if (bBondComm)
8327 /* Update the charge group presence,
8328 * so we can use it in the next pass of the loop.
8330 comm->bLocalCG[cg_gl] = TRUE;
8332 pos_cg++;
8334 if (p == 0)
8336 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8338 zone++;
8339 zone_cg_range[nzone+zone] = pos_cg;
8342 else
8344 /* This part of the code is never executed with bBondComm. */
8345 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8346 index_gl, recv_i, cg_cm, recv_vr,
8347 cgindex, fr->cginfo_mb, fr->cginfo);
8348 pos_cg += ind->nrecv[nzone];
8350 nat_tot += ind->nrecv[nzone+1];
8352 if (!cd->bInPlace)
8354 /* Store the atom block for easy copying of communication buffers */
8355 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8357 nzone += nzone;
8359 dd->index_gl = index_gl;
8360 dd->cgindex = cgindex;
8362 dd->ncg_tot = zone_cg_range[zones->n];
8363 dd->nat_tot = nat_tot;
8364 comm->nat[ddnatHOME] = dd->nat_home;
8365 for (i = ddnatZONE; i < ddnatNR; i++)
8367 comm->nat[i] = dd->nat_tot;
8370 if (!bBondComm)
8372 /* We don't need to update cginfo, since that was already done above.
8373 * So we pass NULL for the forcerec.
8375 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8376 NULL, comm->bLocalCG);
8379 if (debug)
8381 fprintf(debug, "Finished setting up DD communication, zones:");
8382 for (c = 0; c < zones->n; c++)
8384 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8386 fprintf(debug, "\n");
8390 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8392 int c;
8394 for (c = 0; c < zones->nizone; c++)
8396 zones->izone[c].cg1 = zones->cg_range[c+1];
8397 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8398 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8402 static void set_zones_size(gmx_domdec_t *dd,
8403 matrix box, const gmx_ddbox_t *ddbox,
8404 int zone_start, int zone_end)
8406 gmx_domdec_comm_t *comm;
8407 gmx_domdec_zones_t *zones;
8408 gmx_bool bDistMB;
8409 int z, zi, d, dim;
8410 real rcs, rcmbs;
8411 int i, j;
8412 real vol;
8414 comm = dd->comm;
8416 zones = &comm->zones;
8418 /* Do we need to determine extra distances for multi-body bondeds? */
8419 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8421 for (z = zone_start; z < zone_end; z++)
8423 /* Copy cell limits to zone limits.
8424 * Valid for non-DD dims and non-shifted dims.
8426 copy_rvec(comm->cell_x0, zones->size[z].x0);
8427 copy_rvec(comm->cell_x1, zones->size[z].x1);
8430 for (d = 0; d < dd->ndim; d++)
8432 dim = dd->dim[d];
8434 for (z = 0; z < zones->n; z++)
8436 /* With a staggered grid we have different sizes
8437 * for non-shifted dimensions.
8439 if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8441 if (d == 1)
8443 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8444 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8446 else if (d == 2)
8448 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8449 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8454 rcs = comm->cutoff;
8455 rcmbs = comm->cutoff_mbody;
8456 if (ddbox->tric_dir[dim])
8458 rcs /= ddbox->skew_fac[dim];
8459 rcmbs /= ddbox->skew_fac[dim];
8462 /* Set the lower limit for the shifted zone dimensions */
8463 for (z = zone_start; z < zone_end; z++)
8465 if (zones->shift[z][dim] > 0)
8467 dim = dd->dim[d];
8468 if (!dlbIsOn(dd->comm) || d == 0)
8470 zones->size[z].x0[dim] = comm->cell_x1[dim];
8471 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8473 else
8475 /* Here we take the lower limit of the zone from
8476 * the lowest domain of the zone below.
8478 if (z < 4)
8480 zones->size[z].x0[dim] =
8481 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8483 else
8485 if (d == 1)
8487 zones->size[z].x0[dim] =
8488 zones->size[zone_perm[2][z-4]].x0[dim];
8490 else
8492 zones->size[z].x0[dim] =
8493 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8496 /* A temporary limit, is updated below */
8497 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8499 if (bDistMB)
8501 for (zi = 0; zi < zones->nizone; zi++)
8503 if (zones->shift[zi][dim] == 0)
8505 /* This takes the whole zone into account.
8506 * With multiple pulses this will lead
8507 * to a larger zone than strictly necessary.
8509 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8510 zones->size[zi].x1[dim]+rcmbs);
8518 /* Loop over the i-zones to set the upper limit of each
8519 * j-zone they see.
8521 for (zi = 0; zi < zones->nizone; zi++)
8523 if (zones->shift[zi][dim] == 0)
8525 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8527 if (zones->shift[z][dim] > 0)
8529 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8530 zones->size[zi].x1[dim]+rcs);
8537 for (z = zone_start; z < zone_end; z++)
8539 /* Initialization only required to keep the compiler happy */
8540 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8541 int nc, c;
8543 /* To determine the bounding box for a zone we need to find
8544 * the extreme corners of 4, 2 or 1 corners.
8546 nc = 1 << (ddbox->nboundeddim - 1);
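/* E.g. with 3 bounded dimensions nc = 4: the y and z extremes are
 * combined via the (c & 1) and (c & 2) bits below, while the x
 * extremes are handled separately by the x0/x1 offsets added after
 * this loop.
 */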
8548 for (c = 0; c < nc; c++)
8550 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8551 corner[XX] = 0;
8552 if ((c & 1) == 0)
8554 corner[YY] = zones->size[z].x0[YY];
8556 else
8558 corner[YY] = zones->size[z].x1[YY];
8560 if ((c & 2) == 0)
8562 corner[ZZ] = zones->size[z].x0[ZZ];
8564 else
8566 corner[ZZ] = zones->size[z].x1[ZZ];
8568 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8569 box[ZZ][1 - dd->dim[0]] != 0)
8571 /* With 1D domain decomposition the cg's are not in
8572 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8573 * Shift the corner of the z-vector back to along the box
8574 * vector of dimension d, so it will later end up at 0 along d.
8575 * This can affect the location of this corner along dd->dim[0]
8576 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8578 int d = 1 - dd->dim[0];
8580 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8582 /* Apply the triclinic couplings */
8583 assert(ddbox->npbcdim <= DIM);
8584 for (i = YY; i < ddbox->npbcdim; i++)
8586 for (j = XX; j < i; j++)
8588 corner[j] += corner[i]*box[i][j]/box[i][i];
8591 if (c == 0)
8593 copy_rvec(corner, corner_min);
8594 copy_rvec(corner, corner_max);
8596 else
8598 for (i = 0; i < DIM; i++)
8600 corner_min[i] = std::min(corner_min[i], corner[i]);
8601 corner_max[i] = std::max(corner_max[i], corner[i]);
8605 /* Copy the extreme corners without offset along x */
8606 for (i = 0; i < DIM; i++)
8608 zones->size[z].bb_x0[i] = corner_min[i];
8609 zones->size[z].bb_x1[i] = corner_max[i];
8611 /* Add the offset along x */
8612 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8613 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8616 if (zone_start == 0)
8618 vol = 1;
8619 for (dim = 0; dim < DIM; dim++)
8621 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8623 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8626 if (debug)
8628 for (z = zone_start; z < zone_end; z++)
8630 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8632 zones->size[z].x0[XX], zones->size[z].x1[XX],
8633 zones->size[z].x0[YY], zones->size[z].x1[YY],
8634 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8635 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8637 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8638 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8639 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8644 static int comp_cgsort(const void *a, const void *b)
8646 int comp;
8648 gmx_cgsort_t *cga, *cgb;
8649 cga = (gmx_cgsort_t *)a;
8650 cgb = (gmx_cgsort_t *)b;
8652 comp = cga->nsc - cgb->nsc;
8653 if (comp == 0)
8655 comp = cga->ind_gl - cgb->ind_gl;
8658 return comp;
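/* comp_cgsort follows the standard qsort() comparator contract
 * (negative/zero/positive), ordering first on the ns grid cell index
 * and then on the global index, so e.g.
 *     qsort(cgsort, n, sizeof(cgsort[0]), comp_cgsort);
 * would be valid; below the thread-safe gmx_qsort_threadsafe() is
 * used with the same comparator.
 */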
8661 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8662 int *a, int *buf)
8664 int i;
8666 /* Order the data */
8667 for (i = 0; i < n; i++)
8669 buf[i] = a[sort[i].ind];
8672 /* Copy back to the original array */
8673 for (i = 0; i < n; i++)
8675 a[i] = buf[i];
8679 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8680 rvec *v, rvec *buf)
8682 int i;
8684 /* Order the data */
8685 for (i = 0; i < n; i++)
8687 copy_rvec(v[sort[i].ind], buf[i]);
8690 /* Copy back to the original array */
8691 for (i = 0; i < n; i++)
8693 copy_rvec(buf[i], v[i]);
8697 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8698 rvec *v, rvec *buf)
8700 int a, atot, cg, cg0, cg1, i;
8702 if (cgindex == NULL)
8704 /* Avoid the useless loop of the atoms within a cg */
8705 order_vec_cg(ncg, sort, v, buf);
8707 return;
8710 /* Order the data */
8711 a = 0;
8712 for (cg = 0; cg < ncg; cg++)
8714 cg0 = cgindex[sort[cg].ind];
8715 cg1 = cgindex[sort[cg].ind+1];
8716 for (i = cg0; i < cg1; i++)
8718 copy_rvec(v[i], buf[a]);
8719 a++;
8722 atot = a;
8724 /* Copy back to the original array */
8725 for (a = 0; a < atot; a++)
8727 copy_rvec(buf[a], v[a]);
8731 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8732 int nsort_new, gmx_cgsort_t *sort_new,
8733 gmx_cgsort_t *sort1)
8735 int i1, i2, i_new;
8737 /* The new indices are not very ordered, so we qsort them */
8738 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8740 /* sort2 is already ordered, so now we can merge the two arrays */
8741 i1 = 0;
8742 i2 = 0;
8743 i_new = 0;
8744 while (i2 < nsort2 || i_new < nsort_new)
8746 if (i2 == nsort2)
8748 sort1[i1++] = sort_new[i_new++];
8750 else if (i_new == nsort_new)
8752 sort1[i1++] = sort2[i2++];
8754 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8755 (sort2[i2].nsc == sort_new[i_new].nsc &&
8756 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8758 sort1[i1++] = sort2[i2++];
8760 else
8762 sort1[i1++] = sort_new[i_new++];
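/* Merge example for the loop above: with sort2 already ordered as
 * nsc = {1, 3, 5} and sort_new sorting to {2, 4}, sort1 becomes
 * {1, 2, 3, 4, 5}; ties on nsc are broken by ind_gl.
 */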
8767 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8769 gmx_domdec_sort_t *sort;
8770 gmx_cgsort_t *cgsort, *sort_i;
8771 int ncg_new, nsort2, nsort_new, i, *a, moved;
8773 sort = dd->comm->sort;
8775 a = fr->ns.grid->cell_index;
8777 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8779 if (ncg_home_old >= 0)
8781 /* The charge groups that remained in the same ns grid cell
8782 * are completely ordered. So we can sort efficiently by sorting
8783 * only the charge groups that moved and merging them into the stationary list.
8785 ncg_new = 0;
8786 nsort2 = 0;
8787 nsort_new = 0;
8788 for (i = 0; i < dd->ncg_home; i++)
8790 /* Check if this cg did not move to another node */
8791 if (a[i] < moved)
8793 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8795 /* This cg is new on this node or moved to another ns grid cell */
8796 if (nsort_new >= sort->sort_new_nalloc)
8798 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8799 srenew(sort->sort_new, sort->sort_new_nalloc);
8801 sort_i = &(sort->sort_new[nsort_new++]);
8803 else
8805 /* This cg did not move */
8806 sort_i = &(sort->sort2[nsort2++]);
8808 /* Sort on the ns grid cell indices
8809 * and the global topology index.
8810 * index_gl is irrelevant with cell ns,
8811 * but we set it here anyhow to avoid a conditional.
8813 sort_i->nsc = a[i];
8814 sort_i->ind_gl = dd->index_gl[i];
8815 sort_i->ind = i;
8816 ncg_new++;
8819 if (debug)
8821 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8822 nsort2, nsort_new);
8824 /* Sort efficiently */
8825 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8826 sort->sort);
8828 else
8830 cgsort = sort->sort;
8831 ncg_new = 0;
8832 for (i = 0; i < dd->ncg_home; i++)
8834 /* Sort on the ns grid cell indices
8835 * and the global topology index
8837 cgsort[i].nsc = a[i];
8838 cgsort[i].ind_gl = dd->index_gl[i];
8839 cgsort[i].ind = i;
8840 if (cgsort[i].nsc < moved)
8842 ncg_new++;
8845 if (debug)
8847 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8849 /* Determine the order of the charge groups using qsort */
8850 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8853 return ncg_new;
8856 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8858 gmx_cgsort_t *sort;
8859 int ncg_new, i, na;
8860 const int *a;
8862 sort = dd->comm->sort->sort;
8864 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8866 ncg_new = 0;
8867 for (i = 0; i < na; i++)
8869 if (a[i] >= 0)
8871 sort[ncg_new].ind = a[i];
8872 ncg_new++;
8876 return ncg_new;
8879 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8880 int ncg_home_old)
8882 gmx_domdec_sort_t *sort;
8883 gmx_cgsort_t *cgsort;
8884 int *cgindex;
8885 int ncg_new, i, *ibuf, cgsize;
8886 rvec *vbuf;
8888 sort = dd->comm->sort;
8890 if (dd->ncg_home > sort->sort_nalloc)
8892 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8893 srenew(sort->sort, sort->sort_nalloc);
8894 srenew(sort->sort2, sort->sort_nalloc);
8896 cgsort = sort->sort;
8898 switch (fr->cutoff_scheme)
8900 case ecutsGROUP:
8901 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8902 break;
8903 case ecutsVERLET:
8904 ncg_new = dd_sort_order_nbnxn(dd, fr);
8905 break;
8906 default:
8907 gmx_incons("unimplemented");
8908 ncg_new = 0;
8911 /* We alloc with the old size, since cgindex is still old */
8912 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8913 vbuf = dd->comm->vbuf.v;
8915 if (dd->comm->bCGs)
8917 cgindex = dd->cgindex;
8919 else
8921 cgindex = NULL;
8924 /* Remove the charge groups which are no longer at home here */
8925 dd->ncg_home = ncg_new;
8926 if (debug)
8928 fprintf(debug, "Set the new home charge group count to %d\n",
8929 dd->ncg_home);
8932 /* Reorder the state */
8933 for (i = 0; i < estNR; i++)
8935 if (EST_DISTR(i) && (state->flags & (1<<i)))
8937 switch (i)
8939 case estX:
8940 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8941 break;
8942 case estV:
8943 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8944 break;
8945 case estSDX:
8946 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8947 break;
8948 case estCGP:
8949 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8950 break;
8951 case estLD_RNG:
8952 case estLD_RNGI:
8953 case estDISRE_INITF:
8954 case estDISRE_RM3TAV:
8955 case estORIRE_INITF:
8956 case estORIRE_DTAV:
8957 /* No ordering required */
8958 break;
8959 default:
8960 gmx_incons("Unknown state entry encountered in dd_sort_state");
8961 break;
8965 if (fr->cutoff_scheme == ecutsGROUP)
8967 /* Reorder cgcm */
8968 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8971 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8973 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8974 srenew(sort->ibuf, sort->ibuf_nalloc);
8976 ibuf = sort->ibuf;
8977 /* Reorder the global cg index */
8978 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8979 /* Reorder the cginfo */
8980 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8981 /* Rebuild the local cg index */
8982 if (dd->comm->bCGs)
8984 ibuf[0] = 0;
8985 for (i = 0; i < dd->ncg_home; i++)
8987 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8988 ibuf[i+1] = ibuf[i] + cgsize;
8990 for (i = 0; i < dd->ncg_home+1; i++)
8992 dd->cgindex[i] = ibuf[i];
8995 else
8997 for (i = 0; i < dd->ncg_home+1; i++)
8999 dd->cgindex[i] = i;
9002 /* Set the home atom number */
9003 dd->nat_home = dd->cgindex[dd->ncg_home];
9005 if (fr->cutoff_scheme == ecutsVERLET)
9007 /* The atoms are now exactly in grid order, update the grid order */
9008 nbnxn_set_atomorder(fr->nbv->nbs);
9010 else
9012 /* Copy the sorted ns cell indices back to the ns grid struct */
9013 for (i = 0; i < dd->ncg_home; i++)
9015 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9017 fr->ns.grid->nr = dd->ncg_home;
9021 static void add_dd_statistics(gmx_domdec_t *dd)
9023 gmx_domdec_comm_t *comm;
9024 int ddnat;
9026 comm = dd->comm;
9028 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9030 comm->sum_nat[ddnat-ddnatZONE] +=
9031 comm->nat[ddnat] - comm->nat[ddnat-1];
9033 comm->ndecomp++;
9036 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9038 gmx_domdec_comm_t *comm;
9039 int ddnat;
9041 comm = dd->comm;
9043 /* Reset all the statistics and counters for total run counting */
9044 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9046 comm->sum_nat[ddnat-ddnatZONE] = 0;
9048 comm->ndecomp = 0;
9049 comm->nload = 0;
9050 comm->load_step = 0;
9051 comm->load_sum = 0;
9052 comm->load_max = 0;
9053 clear_ivec(comm->load_lim);
9054 comm->load_mdf = 0;
9055 comm->load_pme = 0;
9058 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9060 gmx_domdec_comm_t *comm;
9061 int ddnat;
9062 double av;
9064 comm = cr->dd->comm;
9066 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9068 if (fplog == NULL)
9070 return;
9073 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9075 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9077 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9078 switch (ddnat)
9080 case ddnatZONE:
9081 fprintf(fplog,
9082 " av. #atoms communicated per step for force: %d x %.1f\n",
9083 2, av);
9084 break;
9085 case ddnatVSITE:
9086 if (cr->dd->vsite_comm)
9088 fprintf(fplog,
9089 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9090 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9091 av);
9093 break;
9094 case ddnatCON:
9095 if (cr->dd->constraint_comm)
9097 fprintf(fplog,
9098 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9099 1 + ir->nLincsIter, av);
9101 break;
9102 default:
9103 gmx_incons(" Unknown type for DD statistics");
9106 fprintf(fplog, "\n");
9108 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9110 print_dd_load_av(fplog, cr->dd);
9114 void dd_partition_system(FILE *fplog,
9115 gmx_int64_t step,
9116 t_commrec *cr,
9117 gmx_bool bMasterState,
9118 int nstglobalcomm,
9119 t_state *state_global,
9120 gmx_mtop_t *top_global,
9121 t_inputrec *ir,
9122 t_state *state_local,
9123 rvec **f,
9124 t_mdatoms *mdatoms,
9125 gmx_localtop_t *top_local,
9126 t_forcerec *fr,
9127 gmx_vsite_t *vsite,
9128 gmx_shellfc_t *shellfc,
9129 gmx_constr_t constr,
9130 t_nrnb *nrnb,
9131 gmx_wallcycle_t wcycle,
9132 gmx_bool bVerbose)
9134 gmx_domdec_t *dd;
9135 gmx_domdec_comm_t *comm;
9136 gmx_ddbox_t ddbox = {0};
9137 t_block *cgs_gl;
9138 gmx_int64_t step_pcoupl;
9139 rvec cell_ns_x0, cell_ns_x1;
9140 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9141 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
9142 gmx_bool bRedist, bSortCG, bResortAll;
9143 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9144 real grid_density;
9145 char sbuf[22];
9147 wallcycle_start(wcycle, ewcDOMDEC);
9149 dd = cr->dd;
9150 comm = dd->comm;
9152 bBoxChanged = (bMasterState || DEFORM(*ir));
9153 if (ir->epc != epcNO)
9155 /* With nstpcouple > 1 pressure coupling happens
9156 * one step after calculating the pressure.
9157 * Box scaling happens at the end of the MD step,
9158 * after the DD partitioning.
9159 * We therefore have to do DLB in the first partitioning
9160 * after an MD step where P-coupling occurred.
9161 * We need to determine the last step in which p-coupling occurred.
9162 * MRS -- need to validate this for vv?
9164 n = ir->nstpcouple;
9165 if (n == 1)
9167 step_pcoupl = step - 1;
9169 else
9171 step_pcoupl = ((step - 1)/n)*n + 1;
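/* E.g. n = 5, step = 23: step_pcoupl = (22/5)*5 + 1 = 21, the most
 * recent step of the form k*n + 1 at or before the current step.
 */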
9173 if (step_pcoupl >= comm->partition_step)
9175 bBoxChanged = TRUE;
9179 bNStGlobalComm = (step % nstglobalcomm == 0);
9181 if (!dlbIsOn(comm))
9183 bDoDLB = FALSE;
9185 else
9187 /* Should we do dynamic load balancing this step?
9188 * Since it requires (possibly expensive) global communication,
9189 * we might want to do DLB less frequently.
9191 if (bBoxChanged || ir->epc != epcNO)
9193 bDoDLB = bBoxChanged;
9195 else
9197 bDoDLB = bNStGlobalComm;
9201 /* Check if we have recorded loads on the nodes */
9202 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9204 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9206 /* Print load every nstlog, first and last step to the log file */
9207 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9208 comm->n_load_collect == 0 ||
9209 (ir->nsteps >= 0 &&
9210 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9212 /* Avoid extra communication due to verbose screen output
9213 * when nstglobalcomm is set.
9215 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9216 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9218 get_load_distribution(dd, wcycle);
9219 if (DDMASTER(dd))
9221 if (bLogLoad)
9223 dd_print_load(fplog, dd, step-1);
9225 if (bVerbose)
9227 dd_print_load_verbose(dd);
9230 comm->n_load_collect++;
9232 if (bCheckWhetherToTurnDlbOn)
9234 /* Since the timings are node dependent, the master decides */
9235 if (DDMASTER(dd))
9237 /* Here we check if the max PME rank load is more than 0.98
9238 * the max PP force load. If so, PP DLB will not help,
9239 * since we are (almost) limited by PME. Furthermore,
9240 * DLB will cause a significant extra x/f redistribution
9241 * cost on the PME ranks, which will then surely result
9242 * in lower total performance.
9243 * This check might be fragile, since one measurement
9244 * below 0.98 (although only done once every 100 DD part.)
9245 * could turn on DLB for the rest of the run.
9247 if (cr->npmenodes > 0 &&
9248 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9250 bTurnOnDLB = FALSE;
9252 else
9254 bTurnOnDLB =
9255 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9257 if (debug)
9259 fprintf(debug, "step %s, imb loss %f\n",
9260 gmx_step_str(step, sbuf),
9261 dd_force_imb_perf_loss(dd));
9264 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9265 if (bTurnOnDLB)
9267 turn_on_dlb(fplog, cr, step);
9268 bDoDLB = TRUE;
9272 comm->n_load_have++;
9275 cgs_gl = &comm->cgs_gl;
9277 bRedist = FALSE;
9278 if (bMasterState)
9280 /* Clear the old state */
9281 clear_dd_indices(dd, 0, 0);
9282 ncgindex_set = 0;
9284 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9285 TRUE, cgs_gl, state_global->x, &ddbox);
9287 get_cg_distribution(fplog, dd, cgs_gl,
9288 state_global->box, &ddbox, state_global->x);
9290 dd_distribute_state(dd, cgs_gl,
9291 state_global, state_local, f);
9293 dd_make_local_cgs(dd, &top_local->cgs);
9295 /* Ensure that we have space for the new distribution */
9296 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9298 if (fr->cutoff_scheme == ecutsGROUP)
9300 calc_cgcm(fplog, 0, dd->ncg_home,
9301 &top_local->cgs, state_local->x, fr->cg_cm);
9304 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9306 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9308 else if (state_local->ddp_count != dd->ddp_count)
9310 if (state_local->ddp_count > dd->ddp_count)
9312 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9315 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9317 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9320 /* Clear the old state */
9321 clear_dd_indices(dd, 0, 0);
9323 /* Build the new indices */
9324 rebuild_cgindex(dd, cgs_gl->index, state_local);
9325 make_dd_indices(dd, cgs_gl->index, 0);
9326 ncgindex_set = dd->ncg_home;
9328 if (fr->cutoff_scheme == ecutsGROUP)
9330 /* Redetermine the cg COMs */
9331 calc_cgcm(fplog, 0, dd->ncg_home,
9332 &top_local->cgs, state_local->x, fr->cg_cm);
9335 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9337 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9339 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9340 TRUE, &top_local->cgs, state_local->x, &ddbox);
9342 bRedist = dlbIsOn(comm);
9344 else
9346 /* We have the full state, only redistribute the cgs */
9348 /* Clear the non-home indices */
9349 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9350 ncgindex_set = 0;
9352 /* Avoid global communication for dim's without pbc and -gcom */
9353 if (!bNStGlobalComm)
9355 copy_rvec(comm->box0, ddbox.box0 );
9356 copy_rvec(comm->box_size, ddbox.box_size);
9358 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9359 bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9361 bBoxChanged = TRUE;
9362 bRedist = TRUE;
9364 /* For dim's without pbc and -gcom */
9365 copy_rvec(ddbox.box0, comm->box0 );
9366 copy_rvec(ddbox.box_size, comm->box_size);
9368 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9369 step, wcycle);
9371 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9373 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9376 /* Check if we should sort the charge groups */
9377 if (comm->nstSortCG > 0)
9379 bSortCG = (bMasterState ||
9380 (bRedist && (step % comm->nstSortCG == 0)));
9382 else
9384 bSortCG = FALSE;
9387 ncg_home_old = dd->ncg_home;
9389 ncg_moved = 0;
9390 if (bRedist)
9392 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9394 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9395 state_local, f, fr,
9396 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9398 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9401 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9402 dd, &ddbox,
9403 &comm->cell_x0, &comm->cell_x1,
9404 dd->ncg_home, fr->cg_cm,
9405 cell_ns_x0, cell_ns_x1, &grid_density);
9407 if (bBoxChanged)
9409 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9412 switch (fr->cutoff_scheme)
9414 case ecutsGROUP:
9415 copy_ivec(fr->ns.grid->n, ncells_old);
9416 grid_first(fplog, fr->ns.grid, dd, &ddbox,
9417 state_local->box, cell_ns_x0, cell_ns_x1,
9418 fr->rlistlong, grid_density);
9419 break;
9420 case ecutsVERLET:
9421 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9422 break;
9423 default:
9424 gmx_incons("unimplemented");
9426 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9427 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9429 if (bSortCG)
9431 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9433 /* Sort the state on charge group position.
9434 * This enables exact restarts from this step.
9435 * It also improves performance by about 15% with larger numbers
9436 * of atoms per node.
9439 /* Fill the ns grid with the home cell,
9440 * so we can sort with the indices.
9442 set_zones_ncg_home(dd);
9444 switch (fr->cutoff_scheme)
9446 case ecutsVERLET:
9447 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9449 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9451 comm->zones.size[0].bb_x0,
9452 comm->zones.size[0].bb_x1,
9453 0, dd->ncg_home,
9454 comm->zones.dens_zone0,
9455 fr->cginfo,
9456 state_local->x,
9457 ncg_moved, bRedist ? comm->moved : NULL,
9458 fr->nbv->grp[eintLocal].kernel_type,
9459 fr->nbv->grp[eintLocal].nbat);
9461 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9462 break;
9463 case ecutsGROUP:
9464 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9465 0, dd->ncg_home, fr->cg_cm);
9467 copy_ivec(fr->ns.grid->n, ncells_new);
9468 break;
9469 default:
9470 gmx_incons("unimplemented");
9473 bResortAll = bMasterState;
9475 /* Check if we can use the old order and ns grid cell indices
9476 * of the charge groups to sort the charge groups efficiently.
9478 if (ncells_new[XX] != ncells_old[XX] ||
9479 ncells_new[YY] != ncells_old[YY] ||
9480 ncells_new[ZZ] != ncells_old[ZZ])
9482 bResortAll = TRUE;
9485 if (debug)
9487 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9488 gmx_step_str(step, sbuf), dd->ncg_home);
9490 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9491 bResortAll ? -1 : ncg_home_old);
9492 /* Rebuild all the indices */
9493 ga2la_clear(dd->ga2la);
9494 ncgindex_set = 0;
9496 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9499 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9501 /* Set up the communication and communicate the coordinates */
9502 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9504 /* Set the indices */
9505 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9507 /* Set the charge group boundaries for neighbor searching */
9508 set_cg_boundaries(&comm->zones);
9510 if (fr->cutoff_scheme == ecutsVERLET)
9512 set_zones_size(dd, state_local->box, &ddbox,
9513 bSortCG ? 1 : 0, comm->zones.n);
9516 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9519 /* write_dd_pdb("dd_home", step, "dump", top_global, cr,
9520 -1, state_local->x, state_local->box); */
9523 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9525 /* Extract a local topology from the global topology */
9526 for (i = 0; i < dd->ndim; i++)
9528 np[dd->dim[i]] = comm->cd[i].np;
9530 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9531 comm->cellsize_min, np,
9533 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9534 vsite, top_global, top_local);
9536 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9538 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9540 /* Set up the special atom communication */
9541 n = comm->nat[ddnatZONE];
9542 for (i = ddnatZONE+1; i < ddnatNR; i++)
9544 switch (i)
9546 case ddnatVSITE:
9547 if (vsite && vsite->n_intercg_vsite)
9549 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9551 break;
9552 case ddnatCON:
9553 if (dd->bInterCGcons || dd->bInterCGsettles)
9555 /* Only for inter-cg constraints we need special code */
9556 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9557 constr, ir->nProjOrder,
9558 top_local->idef.il);
9560 break;
9561 default:
9562 gmx_incons("Unknown special atom type setup");
9564 comm->nat[i] = n;
    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }
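
    /* nat_f_novirsum sizes the separate buffer for force contributions
     * whose virial is computed separately, such as the mesh part of full
     * electrostatics.
     */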
    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We set up all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);

    /* Now that we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now that mdatoms is updated, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
                                  mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell data; currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }
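
    /* The set of local bonded interactions has changed, so redistribute
     * them over the OpenMP threads.
     */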
    setup_bonded_threading(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME-only node */
        gmx_pme_send_parameters(cr,
                                fr->ic,
                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }
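
    /* Make the constraint data (e.g. for LINCS and SETTLE) consistent
     * with the new local topology.
     */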
    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->bPull)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    if (ir->eSwapCoords != eswapNO)
    {
        /* Update the local groups needed for ion swapping */
        dd_make_local_swap_groups(dd, ir->swap);
    }

    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
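
    /* Accumulate the atom counts of this partitioning in the statistics
     * reported at the end of the run.
     */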
    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
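    /* ddp_count is compared against the count stored with a state, e.g.
     * to verify that a read-in state matches the current partitioning.
     */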
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }

    wallcycle_stop(wcycle, ewcDOMDEC);