Moved mdatom.h from legacyheaders/types to mdtypes.
[gromacs.git] / src / gromacs / domdec / domdec.cpp
blob 681735b54e53030dd5d7adfb40c39cb25ca3c8b0
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #include "gmxpre.h"
38 #include "domdec.h"
40 #include "config.h"
42 #include <assert.h>
43 #include <limits.h>
44 #include <math.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
49 #include <algorithm>
51 #include "gromacs/domdec/domdec_network.h"
52 #include "gromacs/domdec/ga2la.h"
53 #include "gromacs/ewald/pme.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/chargegroup.h"
57 #include "gromacs/gmxlib/gmx_omp_nthreads.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
60 #include "gromacs/imd/imd.h"
61 #include "gromacs/legacyheaders/names.h"
62 #include "gromacs/legacyheaders/nrnb.h"
63 #include "gromacs/legacyheaders/types/commrec.h"
64 #include "gromacs/legacyheaders/types/enums.h"
65 #include "gromacs/legacyheaders/types/forcerec.h"
66 #include "gromacs/legacyheaders/types/hw_info.h"
67 #include "gromacs/legacyheaders/types/ifunc.h"
68 #include "gromacs/legacyheaders/types/nrnb.h"
69 #include "gromacs/legacyheaders/types/ns.h"
70 #include "gromacs/listed-forces/manage-threading.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/math/vectypes.h"
73 #include "gromacs/mdlib/constr.h"
74 #include "gromacs/mdlib/force.h"
75 #include "gromacs/mdlib/forcerec.h"
76 #include "gromacs/mdlib/genborn.h"
77 #include "gromacs/mdlib/mdatoms.h"
78 #include "gromacs/mdlib/mdrun.h"
79 #include "gromacs/mdlib/nb_verlet.h"
80 #include "gromacs/mdlib/nbnxn_grid.h"
81 #include "gromacs/mdlib/nsgrid.h"
82 #include "gromacs/mdlib/shellfc.h"
83 #include "gromacs/mdlib/vsite.h"
84 #include "gromacs/mdtypes/df_history.h"
85 #include "gromacs/mdtypes/inputrec.h"
86 #include "gromacs/mdtypes/mdatom.h"
87 #include "gromacs/mdtypes/state.h"
88 #include "gromacs/pbcutil/ishift.h"
89 #include "gromacs/pbcutil/pbc.h"
90 #include "gromacs/pulling/pull.h"
91 #include "gromacs/pulling/pull_rotation.h"
92 #include "gromacs/swap/swapcoords.h"
93 #include "gromacs/timing/wallcycle.h"
94 #include "gromacs/topology/block.h"
95 #include "gromacs/topology/idef.h"
96 #include "gromacs/topology/mtop_util.h"
97 #include "gromacs/topology/topology.h"
98 #include "gromacs/utility/basedefinitions.h"
99 #include "gromacs/utility/basenetwork.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/exceptions.h"
102 #include "gromacs/utility/fatalerror.h"
103 #include "gromacs/utility/gmxmpi.h"
104 #include "gromacs/utility/qsort_threadsafe.h"
105 #include "gromacs/utility/real.h"
106 #include "gromacs/utility/smalloc.h"
108 #include "domdec_constraints.h"
109 #include "domdec_internal.h"
110 #include "domdec_vsite.h"
112 #define DDRANK(dd, rank) (rank)
113 #define DDMASTERRANK(dd) (dd->masterrank)
115 typedef struct gmx_domdec_master
117 /* The cell boundaries */
118 real **cell_x;
119 /* The global charge group division */
120 int *ncg; /* Number of home charge groups for each node */
121 int *index; /* Index of nnodes+1 into cg */
122 int *cg; /* Global charge group index */
123 int *nat; /* Number of home atoms for each node. */
124 int *ibuf; /* Buffer for communication */
125 rvec *vbuf; /* Buffer for state scattering and gathering */
126 } gmx_domdec_master_t;
128 #define DD_NLOAD_MAX 9
130 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
132 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
133 #define DD_CGIBS 2
135 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
136 #define DD_FLAG_NRCG 65535
137 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
138 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
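/* A worked example of this flag layout (illustration only, assuming the
 * low 16 bits hold the charge-group atom count, as the DD_FLAG_NRCG mask
 * suggests): bits 16+2*d and 17+2*d mark a move in the forward and
 * backward direction along decomposition dimension d, so a 3-atom charge
 * group moving backward along dimension 1 would carry
 *     3 | DD_FLAG_BW(1) == 3 | (1 << 19) == 524291
 */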
140 /* The DD zone order */
141 static const ivec dd_zo[DD_MAXZONE] =
142 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
144 /* The 3D setup */
145 #define dd_z3n 8
146 #define dd_zp3n 4
147 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
149 /* The 2D setup */
150 #define dd_z2n 4
151 #define dd_zp2n 2
152 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
154 /* The 1D setup */
155 #define dd_z1n 2
156 #define dd_zp1n 1
157 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
159 /* The 0D setup */
160 #define dd_z0n 1
161 #define dd_zp0n 1
162 static const ivec dd_zp0[dd_zp0n] = {{0, 0, 1}};
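/* These triples appear to encode { i-zone, first j-zone, last j-zone + 1 }
 * for the zone-pair ranges; reading the 3D setup that way, i-zone 0 pairs
 * with j-zones 0-7, i-zone 1 with 3-5, i-zone 2 with 5 and i-zone 3 with
 * 5-6. This interpretation is inferred from the values above, not stated
 * in the source.
 */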
164 /* Factors used to avoid problems due to rounding issues */
165 #define DD_CELL_MARGIN 1.0001
166 #define DD_CELL_MARGIN2 1.00005
167 /* Factor to account for pressure scaling during nstlist steps */
168 #define DD_PRES_SCALE_MARGIN 1.02
170 /* Turn on DLB when the load imbalance causes this amount of total loss.
171 * There is a bit of overhead with DLB and it's difficult to achieve
172 * a load imbalance of less than 2% with DLB.
174 #define DD_PERF_LOSS_DLB_ON 0.02
176 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
177 #define DD_PERF_LOSS_WARN 0.05
179 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
181 /* Use separate MPI send and receive commands
182 * when nnodes <= GMX_DD_NNODES_SENDRECV.
183 * This saves memory (and some copying for small nnodes).
184 * For high parallelization scatter and gather calls are used.
186 #define GMX_DD_NNODES_SENDRECV 4
190 /* #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
192 static void index2xyz(ivec nc,int ind,ivec xyz)
194 xyz[XX] = ind % nc[XX];
195 xyz[YY] = (ind / nc[XX]) % nc[YY];
196 xyz[ZZ] = ind / (nc[YY]*nc[XX]); */
200 /* This order is required to minimize the coordinate communication in PME
201 * which uses decomposition in the x direction.
203 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
205 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
207 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
208 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
209 xyz[ZZ] = ind % nc[ZZ];
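/* A small worked example of this x-major indexing (illustration only):
 * with nc = {4, 3, 2}, the cell at ci = {1, 2, 1} gets
 * dd_index(nc, ci) = (1*3 + 2)*2 + 1 = 11, and ddindex2xyz maps 11 back
 * to {11/(3*2), (11/2) % 3, 11 % 2} = {1, 2, 1}.
 */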
212 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
214 int ddindex;
215 int ddnodeid = -1;
217 ddindex = dd_index(dd->nc, c);
218 if (dd->comm->bCartesianPP_PME)
220 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
222 else if (dd->comm->bCartesianPP)
224 #ifdef GMX_MPI
225 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
226 #endif
228 else
230 ddnodeid = ddindex;
233 return ddnodeid;
236 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
238 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
241 int ddglatnr(gmx_domdec_t *dd, int i)
243 int atnr;
245 if (dd == NULL)
247 atnr = i + 1;
249 else
251 if (i >= dd->comm->nat[ddnatNR-1])
253 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
255 atnr = dd->gatindex[i] + 1;
258 return atnr;
261 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
263 return &dd->comm->cgs_gl;
266 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
268 return (comm->dlbState == edlbsOn);
271 static void vec_rvec_init(vec_rvec_t *v)
273 v->nalloc = 0;
274 v->v = NULL;
277 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
279 if (n > v->nalloc)
281 v->nalloc = over_alloc_dd(n);
282 srenew(v->v, v->nalloc);
286 void dd_store_state(gmx_domdec_t *dd, t_state *state)
288 int i;
290 if (state->ddp_count != dd->ddp_count)
292 gmx_incons("The state does not match the domain decomposition state");
295 state->ncg_gl = dd->ncg_home;
296 if (state->ncg_gl > state->cg_gl_nalloc)
298 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
299 srenew(state->cg_gl, state->cg_gl_nalloc);
301 for (i = 0; i < state->ncg_gl; i++)
303 state->cg_gl[i] = dd->index_gl[i];
306 state->ddp_count_cg_gl = dd->ddp_count;
309 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
311 return &dd->comm->zones;
314 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
315 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
317 gmx_domdec_zones_t *zones;
318 int izone, d, dim;
320 zones = &dd->comm->zones;
322 izone = 0;
323 while (icg >= zones->izone[izone].cg1)
325 izone++;
328 if (izone == 0)
330 *jcg0 = icg;
332 else if (izone < zones->nizone)
334 *jcg0 = zones->izone[izone].jcg0;
336 else
338 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
339 icg, izone, zones->nizone);
342 *jcg1 = zones->izone[izone].jcg1;
344 for (d = 0; d < dd->ndim; d++)
346 dim = dd->dim[d];
347 shift0[dim] = zones->izone[izone].shift0[dim];
348 shift1[dim] = zones->izone[izone].shift1[dim];
349 if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
351 /* A conservative approach, this can be optimized */
352 shift0[dim] -= 1;
353 shift1[dim] += 1;
358 int dd_natoms_vsite(gmx_domdec_t *dd)
360 return dd->comm->nat[ddnatVSITE];
363 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
365 *at_start = dd->comm->nat[ddnatCON-1];
366 *at_end = dd->comm->nat[ddnatCON];
369 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
371 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
372 int *index, *cgindex;
373 gmx_domdec_comm_t *comm;
374 gmx_domdec_comm_dim_t *cd;
375 gmx_domdec_ind_t *ind;
376 rvec shift = {0, 0, 0}, *buf, *rbuf;
377 gmx_bool bPBC, bScrew;
379 comm = dd->comm;
381 cgindex = dd->cgindex;
383 buf = comm->vbuf.v;
385 nzone = 1;
386 nat_tot = dd->nat_home;
387 for (d = 0; d < dd->ndim; d++)
389 bPBC = (dd->ci[dd->dim[d]] == 0);
390 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
391 if (bPBC)
393 copy_rvec(box[dd->dim[d]], shift);
395 cd = &comm->cd[d];
396 for (p = 0; p < cd->np; p++)
398 ind = &cd->ind[p];
399 index = ind->index;
400 n = 0;
401 if (!bPBC)
403 for (i = 0; i < ind->nsend[nzone]; i++)
405 at0 = cgindex[index[i]];
406 at1 = cgindex[index[i]+1];
407 for (j = at0; j < at1; j++)
409 copy_rvec(x[j], buf[n]);
410 n++;
414 else if (!bScrew)
416 for (i = 0; i < ind->nsend[nzone]; i++)
418 at0 = cgindex[index[i]];
419 at1 = cgindex[index[i]+1];
420 for (j = at0; j < at1; j++)
422 /* We need to shift the coordinates */
423 rvec_add(x[j], shift, buf[n]);
424 n++;
428 else
430 for (i = 0; i < ind->nsend[nzone]; i++)
432 at0 = cgindex[index[i]];
433 at1 = cgindex[index[i]+1];
434 for (j = at0; j < at1; j++)
436 /* Shift x */
437 buf[n][XX] = x[j][XX] + shift[XX];
438 /* Rotate y and z.
439 * This operation requires a special shift force
440 * treatment, which is performed in calc_vir.
442 buf[n][YY] = box[YY][YY] - x[j][YY];
443 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
444 n++;
449 if (cd->bInPlace)
451 rbuf = x + nat_tot;
453 else
455 rbuf = comm->vbuf2.v;
457 /* Send and receive the coordinates */
458 dd_sendrecv_rvec(dd, d, dddirBackward,
459 buf, ind->nsend[nzone+1],
460 rbuf, ind->nrecv[nzone+1]);
461 if (!cd->bInPlace)
463 j = 0;
464 for (zone = 0; zone < nzone; zone++)
466 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
468 copy_rvec(rbuf[j], x[i]);
469 j++;
473 nat_tot += ind->nrecv[nzone+1];
475 nzone += nzone;
479 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
481 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
482 int *index, *cgindex;
483 gmx_domdec_comm_t *comm;
484 gmx_domdec_comm_dim_t *cd;
485 gmx_domdec_ind_t *ind;
486 rvec *buf, *sbuf;
487 ivec vis;
488 int is;
489 gmx_bool bShiftForcesNeedPbc, bScrew;
491 comm = dd->comm;
493 cgindex = dd->cgindex;
495 buf = comm->vbuf.v;
497 nzone = comm->zones.n/2;
498 nat_tot = dd->nat_tot;
499 for (d = dd->ndim-1; d >= 0; d--)
501 /* Only forces in domains near the PBC boundaries need to
502 consider PBC in the treatment of fshift */
503 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
504 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
505 if (fshift == NULL && !bScrew)
507 bShiftForcesNeedPbc = FALSE;
509 /* Determine which shift vector we need */
510 clear_ivec(vis);
511 vis[dd->dim[d]] = 1;
512 is = IVEC2IS(vis);
514 cd = &comm->cd[d];
515 for (p = cd->np-1; p >= 0; p--)
517 ind = &cd->ind[p];
518 nat_tot -= ind->nrecv[nzone+1];
519 if (cd->bInPlace)
521 sbuf = f + nat_tot;
523 else
525 sbuf = comm->vbuf2.v;
526 j = 0;
527 for (zone = 0; zone < nzone; zone++)
529 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
531 copy_rvec(f[i], sbuf[j]);
532 j++;
536 /* Communicate the forces */
537 dd_sendrecv_rvec(dd, d, dddirForward,
538 sbuf, ind->nrecv[nzone+1],
539 buf, ind->nsend[nzone+1]);
540 index = ind->index;
541 /* Add the received forces */
542 n = 0;
543 if (!bShiftForcesNeedPbc)
545 for (i = 0; i < ind->nsend[nzone]; i++)
547 at0 = cgindex[index[i]];
548 at1 = cgindex[index[i]+1];
549 for (j = at0; j < at1; j++)
551 rvec_inc(f[j], buf[n]);
552 n++;
556 else if (!bScrew)
558 /* fshift should always be defined if this function is
559 * called when bShiftForcesNeedPbc is true */
560 assert(NULL != fshift);
561 for (i = 0; i < ind->nsend[nzone]; i++)
563 at0 = cgindex[index[i]];
564 at1 = cgindex[index[i]+1];
565 for (j = at0; j < at1; j++)
567 rvec_inc(f[j], buf[n]);
568 /* Add this force to the shift force */
569 rvec_inc(fshift[is], buf[n]);
570 n++;
574 else
576 for (i = 0; i < ind->nsend[nzone]; i++)
578 at0 = cgindex[index[i]];
579 at1 = cgindex[index[i]+1];
580 for (j = at0; j < at1; j++)
582 /* Rotate the force */
583 f[j][XX] += buf[n][XX];
584 f[j][YY] -= buf[n][YY];
585 f[j][ZZ] -= buf[n][ZZ];
586 if (fshift)
588 /* Add this force to the shift force */
589 rvec_inc(fshift[is], buf[n]);
591 n++;
596 nzone /= 2;
600 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
602 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
603 int *index, *cgindex;
604 gmx_domdec_comm_t *comm;
605 gmx_domdec_comm_dim_t *cd;
606 gmx_domdec_ind_t *ind;
607 real *buf, *rbuf;
609 comm = dd->comm;
611 cgindex = dd->cgindex;
613 buf = &comm->vbuf.v[0][0];
615 nzone = 1;
616 nat_tot = dd->nat_home;
617 for (d = 0; d < dd->ndim; d++)
619 cd = &comm->cd[d];
620 for (p = 0; p < cd->np; p++)
622 ind = &cd->ind[p];
623 index = ind->index;
624 n = 0;
625 for (i = 0; i < ind->nsend[nzone]; i++)
627 at0 = cgindex[index[i]];
628 at1 = cgindex[index[i]+1];
629 for (j = at0; j < at1; j++)
631 buf[n] = v[j];
632 n++;
636 if (cd->bInPlace)
638 rbuf = v + nat_tot;
640 else
642 rbuf = &comm->vbuf2.v[0][0];
644 /* Send and receive the coordinates */
645 dd_sendrecv_real(dd, d, dddirBackward,
646 buf, ind->nsend[nzone+1],
647 rbuf, ind->nrecv[nzone+1]);
648 if (!cd->bInPlace)
650 j = 0;
651 for (zone = 0; zone < nzone; zone++)
653 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
655 v[i] = rbuf[j];
656 j++;
660 nat_tot += ind->nrecv[nzone+1];
662 nzone += nzone;
666 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
668 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
669 int *index, *cgindex;
670 gmx_domdec_comm_t *comm;
671 gmx_domdec_comm_dim_t *cd;
672 gmx_domdec_ind_t *ind;
673 real *buf, *sbuf;
675 comm = dd->comm;
677 cgindex = dd->cgindex;
679 buf = &comm->vbuf.v[0][0];
681 nzone = comm->zones.n/2;
682 nat_tot = dd->nat_tot;
683 for (d = dd->ndim-1; d >= 0; d--)
685 cd = &comm->cd[d];
686 for (p = cd->np-1; p >= 0; p--)
688 ind = &cd->ind[p];
689 nat_tot -= ind->nrecv[nzone+1];
690 if (cd->bInPlace)
692 sbuf = v + nat_tot;
694 else
696 sbuf = &comm->vbuf2.v[0][0];
697 j = 0;
698 for (zone = 0; zone < nzone; zone++)
700 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
702 sbuf[j] = v[i];
703 j++;
707 /* Communicate the forces */
708 dd_sendrecv_real(dd, d, dddirForward,
709 sbuf, ind->nrecv[nzone+1],
710 buf, ind->nsend[nzone+1]);
711 index = ind->index;
712 /* Add the received forces */
713 n = 0;
714 for (i = 0; i < ind->nsend[nzone]; i++)
716 at0 = cgindex[index[i]];
717 at1 = cgindex[index[i]+1];
718 for (j = at0; j < at1; j++)
720 v[j] += buf[n];
721 n++;
725 nzone /= 2;
729 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
731 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
732 d, i, j,
733 zone->min0, zone->max1,
734 zone->mch0, zone->mch1,
735 zone->p1_0, zone->p1_1);
739 #define DDZONECOMM_MAXZONE 5
740 #define DDZONECOMM_BUFSIZE 3
742 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
743 int ddimind, int direction,
744 gmx_ddzone_t *buf_s, int n_s,
745 gmx_ddzone_t *buf_r, int n_r)
747 #define ZBS DDZONECOMM_BUFSIZE
748 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
749 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
750 int i;
752 for (i = 0; i < n_s; i++)
754 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
755 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
756 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
757 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
758 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
759 vbuf_s[i*ZBS+1][2] = 0;
760 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
761 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
762 vbuf_s[i*ZBS+2][2] = 0;
765 dd_sendrecv_rvec(dd, ddimind, direction,
766 vbuf_s, n_s*ZBS,
767 vbuf_r, n_r*ZBS);
769 for (i = 0; i < n_r; i++)
771 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
772 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
773 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
774 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
775 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
776 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
777 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
780 #undef ZBS
783 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
784 rvec cell_ns_x0, rvec cell_ns_x1)
786 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
787 gmx_ddzone_t *zp;
788 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
789 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
790 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
791 rvec extr_s[2], extr_r[2];
792 rvec dh;
793 real dist_d, c = 0, det;
794 gmx_domdec_comm_t *comm;
795 gmx_bool bPBC, bUse;
797 comm = dd->comm;
799 for (d = 1; d < dd->ndim; d++)
801 dim = dd->dim[d];
802 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
803 zp->min0 = cell_ns_x0[dim];
804 zp->max1 = cell_ns_x1[dim];
805 zp->min1 = cell_ns_x1[dim];
806 zp->mch0 = cell_ns_x0[dim];
807 zp->mch1 = cell_ns_x1[dim];
808 zp->p1_0 = cell_ns_x0[dim];
809 zp->p1_1 = cell_ns_x1[dim];
812 for (d = dd->ndim-2; d >= 0; d--)
814 dim = dd->dim[d];
815 bPBC = (dim < ddbox->npbcdim);
817 /* Use an rvec to store two reals */
818 extr_s[d][0] = comm->cell_f0[d+1];
819 extr_s[d][1] = comm->cell_f1[d+1];
820 extr_s[d][2] = comm->cell_f1[d+1];
822 pos = 0;
823 /* Store the extremes in the backward sending buffer,
824 * so they get updated separately from the forward communication.
826 for (d1 = d; d1 < dd->ndim-1; d1++)
828 /* We invert the order to be able to use the same loop for buf_e */
829 buf_s[pos].min0 = extr_s[d1][1];
830 buf_s[pos].max1 = extr_s[d1][0];
831 buf_s[pos].min1 = extr_s[d1][2];
832 buf_s[pos].mch0 = 0;
833 buf_s[pos].mch1 = 0;
834 /* Store the cell corner of the dimension we communicate along */
835 buf_s[pos].p1_0 = comm->cell_x0[dim];
836 buf_s[pos].p1_1 = 0;
837 pos++;
840 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
841 pos++;
843 if (dd->ndim == 3 && d == 0)
845 buf_s[pos] = comm->zone_d2[0][1];
846 pos++;
847 buf_s[pos] = comm->zone_d1[0];
848 pos++;
851 /* We only need to communicate the extremes
852 * in the forward direction
854 npulse = comm->cd[d].np;
855 if (bPBC)
857 /* Take the minimum to avoid double communication */
858 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
860 else
862 /* Without PBC we should really not communicate over
863 * the boundaries, but implementing that complicates
864 * the communication setup and therefore we simply
865 * do all communication, but ignore some data.
867 npulse_min = npulse;
869 for (p = 0; p < npulse_min; p++)
871 /* Communicate the extremes forward */
872 bUse = (bPBC || dd->ci[dim] > 0);
874 dd_sendrecv_rvec(dd, d, dddirForward,
875 extr_s+d, dd->ndim-d-1,
876 extr_r+d, dd->ndim-d-1);
878 if (bUse)
880 for (d1 = d; d1 < dd->ndim-1; d1++)
882 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
883 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
884 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
889 buf_size = pos;
890 for (p = 0; p < npulse; p++)
892 /* Communicate all the zone information backward */
893 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
895 dd_sendrecv_ddzone(dd, d, dddirBackward,
896 buf_s, buf_size,
897 buf_r, buf_size);
899 clear_rvec(dh);
900 if (p > 0)
902 for (d1 = d+1; d1 < dd->ndim; d1++)
904 /* Determine the decrease of maximum required
905 * communication height along d1 due to the distance along d;
906 * this avoids a lot of useless atom communication.
908 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
910 if (ddbox->tric_dir[dim])
912 /* c is the off-diagonal coupling between the cell planes
913 * along directions d and d1.
915 c = ddbox->v[dim][dd->dim[d1]][dim];
917 else
919 c = 0;
921 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
922 if (det > 0)
924 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
926 else
928 /* A negative value signals out of range */
929 dh[d1] = -1;
934 /* Accumulate the extremes over all pulses */
935 for (i = 0; i < buf_size; i++)
937 if (p == 0)
939 buf_e[i] = buf_r[i];
941 else
943 if (bUse)
945 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
946 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
947 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
950 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
952 d1 = 1;
954 else
956 d1 = d + 1;
958 if (bUse && dh[d1] >= 0)
960 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
961 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
964 /* Copy the received buffer to the send buffer,
965 * to pass the data through with the next pulse.
967 buf_s[i] = buf_r[i];
969 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
970 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
972 /* Store the extremes */
973 pos = 0;
975 for (d1 = d; d1 < dd->ndim-1; d1++)
977 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
978 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
979 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
980 pos++;
983 if (d == 1 || (d == 0 && dd->ndim == 3))
985 for (i = d; i < 2; i++)
987 comm->zone_d2[1-d][i] = buf_e[pos];
988 pos++;
991 if (d == 0)
993 comm->zone_d1[1] = buf_e[pos];
994 pos++;
1000 if (dd->ndim >= 2)
1002 dim = dd->dim[1];
1003 for (i = 0; i < 2; i++)
1005 if (debug)
1007 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1009 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1010 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1013 if (dd->ndim >= 3)
1015 dim = dd->dim[2];
1016 for (i = 0; i < 2; i++)
1018 for (j = 0; j < 2; j++)
1020 if (debug)
1022 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1024 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1025 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1029 for (d = 1; d < dd->ndim; d++)
1031 comm->cell_f_max0[d] = extr_s[d-1][0];
1032 comm->cell_f_min1[d] = extr_s[d-1][1];
1033 if (debug)
1035 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1036 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1041 static void dd_collect_cg(gmx_domdec_t *dd,
1042 t_state *state_local)
1044 gmx_domdec_master_t *ma = NULL;
1045 int buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1047 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1049 /* The master has the correct distribution */
1050 return;
1053 if (state_local->ddp_count == dd->ddp_count)
1055 /* The local state and DD are in sync, use the DD indices */
1056 ncg_home = dd->ncg_home;
1057 cg = dd->index_gl;
1058 nat_home = dd->nat_home;
1060 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1062 /* The DD is out of sync with the local state, but we have stored
1063 * the cg indices with the local state, so we can use those.
1065 t_block *cgs_gl;
1067 cgs_gl = &dd->comm->cgs_gl;
1069 ncg_home = state_local->ncg_gl;
1070 cg = state_local->cg_gl;
1071 nat_home = 0;
1072 for (i = 0; i < ncg_home; i++)
1074 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1077 else
1079 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1082 buf2[0] = ncg_home;
1083 buf2[1] = nat_home;
1084 if (DDMASTER(dd))
1086 ma = dd->ma;
1087 ibuf = ma->ibuf;
1089 else
1091 ibuf = NULL;
1093 /* Collect the charge group and atom counts on the master */
1094 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1096 if (DDMASTER(dd))
1098 ma->index[0] = 0;
1099 for (i = 0; i < dd->nnodes; i++)
1101 ma->ncg[i] = ma->ibuf[2*i];
1102 ma->nat[i] = ma->ibuf[2*i+1];
1103 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1106 /* Make byte counts and indices */
1107 for (i = 0; i < dd->nnodes; i++)
1109 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1110 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1112 if (debug)
1114 fprintf(debug, "Initial charge group distribution: ");
1115 for (i = 0; i < dd->nnodes; i++)
1117 fprintf(debug, " %d", ma->ncg[i]);
1119 fprintf(debug, "\n");
1123 /* Collect the charge group indices on the master */
1124 dd_gatherv(dd,
1125 ncg_home*sizeof(int), cg,
1126 DDMASTER(dd) ? ma->ibuf : NULL,
1127 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1128 DDMASTER(dd) ? ma->cg : NULL);
1130 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1133 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1134 rvec *lv, rvec *v)
1136 gmx_domdec_master_t *ma;
1137 int n, i, c, a, nalloc = 0;
1138 rvec *buf = NULL;
1139 t_block *cgs_gl;
1141 ma = dd->ma;
1143 if (!DDMASTER(dd))
1145 #ifdef GMX_MPI
1146 MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1147 dd->rank, dd->mpi_comm_all);
1148 #endif
1150 else
1152 /* Copy the master coordinates to the global array */
1153 cgs_gl = &dd->comm->cgs_gl;
1155 n = DDMASTERRANK(dd);
1156 a = 0;
1157 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1159 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1161 copy_rvec(lv[a++], v[c]);
1165 for (n = 0; n < dd->nnodes; n++)
1167 if (n != dd->rank)
1169 if (ma->nat[n] > nalloc)
1171 nalloc = over_alloc_dd(ma->nat[n]);
1172 srenew(buf, nalloc);
1174 #ifdef GMX_MPI
1175 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1176 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1177 #endif
1178 a = 0;
1179 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1181 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1183 copy_rvec(buf[a++], v[c]);
1188 sfree(buf);
1192 static void get_commbuffer_counts(gmx_domdec_t *dd,
1193 int **counts, int **disps)
1195 gmx_domdec_master_t *ma;
1196 int n;
1198 ma = dd->ma;
1200 /* Make the rvec count and displacement arrays */
1201 *counts = ma->ibuf;
1202 *disps = ma->ibuf + dd->nnodes;
1203 for (n = 0; n < dd->nnodes; n++)
1205 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1206 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1210 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1211 rvec *lv, rvec *v)
1213 gmx_domdec_master_t *ma;
1214 int *rcounts = NULL, *disps = NULL;
1215 int n, i, c, a;
1216 rvec *buf = NULL;
1217 t_block *cgs_gl;
1219 ma = dd->ma;
1221 if (DDMASTER(dd))
1223 get_commbuffer_counts(dd, &rcounts, &disps);
1225 buf = ma->vbuf;
1228 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1230 if (DDMASTER(dd))
1232 cgs_gl = &dd->comm->cgs_gl;
1234 a = 0;
1235 for (n = 0; n < dd->nnodes; n++)
1237 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1239 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1241 copy_rvec(buf[a++], v[c]);
1248 void dd_collect_vec(gmx_domdec_t *dd,
1249 t_state *state_local, rvec *lv, rvec *v)
1251 dd_collect_cg(dd, state_local);
1253 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1255 dd_collect_vec_sendrecv(dd, lv, v);
1257 else
1259 dd_collect_vec_gatherv(dd, lv, v);
1264 void dd_collect_state(gmx_domdec_t *dd,
1265 t_state *state_local, t_state *state)
1267 int est, i, j, nh;
1269 nh = state->nhchainlength;
1271 if (DDMASTER(dd))
1273 for (i = 0; i < efptNR; i++)
1275 state->lambda[i] = state_local->lambda[i];
1277 state->fep_state = state_local->fep_state;
1278 state->veta = state_local->veta;
1279 state->vol0 = state_local->vol0;
1280 copy_mat(state_local->box, state->box);
1281 copy_mat(state_local->boxv, state->boxv);
1282 copy_mat(state_local->svir_prev, state->svir_prev);
1283 copy_mat(state_local->fvir_prev, state->fvir_prev);
1284 copy_mat(state_local->pres_prev, state->pres_prev);
1286 for (i = 0; i < state_local->ngtc; i++)
1288 for (j = 0; j < nh; j++)
1290 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1291 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1293 state->therm_integral[i] = state_local->therm_integral[i];
1295 for (i = 0; i < state_local->nnhpres; i++)
1297 for (j = 0; j < nh; j++)
1299 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1300 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1304 for (est = 0; est < estNR; est++)
1306 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1308 switch (est)
1310 case estX:
1311 dd_collect_vec(dd, state_local, state_local->x, state->x);
1312 break;
1313 case estV:
1314 dd_collect_vec(dd, state_local, state_local->v, state->v);
1315 break;
1316 case estSDX:
1317 dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1318 break;
1319 case estCGP:
1320 dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1321 break;
1322 case estDISRE_INITF:
1323 case estDISRE_RM3TAV:
1324 case estORIRE_INITF:
1325 case estORIRE_DTAV:
1326 break;
1327 default:
1328 gmx_incons("Unknown state entry encountered in dd_collect_state");
1334 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1336 int est;
1338 if (debug)
1340 fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1343 state->nalloc = over_alloc_dd(nalloc);
1345 for (est = 0; est < estNR; est++)
1347 if (EST_DISTR(est) && (state->flags & (1<<est)))
1349 switch (est)
1351 case estX:
1352 srenew(state->x, state->nalloc);
1353 break;
1354 case estV:
1355 srenew(state->v, state->nalloc);
1356 break;
1357 case estSDX:
1358 srenew(state->sd_X, state->nalloc);
1359 break;
1360 case estCGP:
1361 srenew(state->cg_p, state->nalloc);
1362 break;
1363 case estDISRE_INITF:
1364 case estDISRE_RM3TAV:
1365 case estORIRE_INITF:
1366 case estORIRE_DTAV:
1367 /* No reallocation required */
1368 break;
1369 default:
1370 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1375 if (f != NULL)
1377 srenew(*f, state->nalloc);
1381 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1382 int nalloc)
1384 if (nalloc > fr->cg_nalloc)
1386 if (debug)
1388 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1390 fr->cg_nalloc = over_alloc_dd(nalloc);
1391 srenew(fr->cginfo, fr->cg_nalloc);
1392 if (fr->cutoff_scheme == ecutsGROUP)
1394 srenew(fr->cg_cm, fr->cg_nalloc);
1397 if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1399 /* We don't use charge groups, we use x in state to set up
1400 * the atom communication.
1402 dd_realloc_state(state, f, nalloc);
1406 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1407 rvec *v, rvec *lv)
1409 gmx_domdec_master_t *ma;
1410 int n, i, c, a, nalloc = 0;
1411 rvec *buf = NULL;
1413 if (DDMASTER(dd))
1415 ma = dd->ma;
1417 for (n = 0; n < dd->nnodes; n++)
1419 if (n != dd->rank)
1421 if (ma->nat[n] > nalloc)
1423 nalloc = over_alloc_dd(ma->nat[n]);
1424 srenew(buf, nalloc);
1426 /* Pack the coordinates destined for rank n into a temporary buffer */
1427 a = 0;
1428 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1430 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1432 copy_rvec(v[c], buf[a++]);
1435 if (a != ma->nat[n])
1437 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1438 a, ma->nat[n]);
1441 #ifdef GMX_MPI
1442 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1443 DDRANK(dd, n), n, dd->mpi_comm_all);
1444 #endif
1447 sfree(buf);
1448 n = DDMASTERRANK(dd);
1449 a = 0;
1450 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1452 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1454 copy_rvec(v[c], lv[a++]);
1458 else
1460 #ifdef GMX_MPI
1461 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1462 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1463 #endif
1467 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1468 rvec *v, rvec *lv)
1470 gmx_domdec_master_t *ma;
1471 int *scounts = NULL, *disps = NULL;
1472 int n, i, c, a;
1473 rvec *buf = NULL;
1475 if (DDMASTER(dd))
1477 ma = dd->ma;
1479 get_commbuffer_counts(dd, &scounts, &disps);
1481 buf = ma->vbuf;
1482 a = 0;
1483 for (n = 0; n < dd->nnodes; n++)
1485 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1487 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1489 copy_rvec(v[c], buf[a++]);
1495 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1498 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1500 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1502 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1504 else
1506 dd_distribute_vec_scatterv(dd, cgs, v, lv);
1510 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1512 int i;
1513 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1514 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1515 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1517 if (dfhist->nlambda > 0)
1519 int nlam = dfhist->nlambda;
1520 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1521 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1522 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1523 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1524 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1525 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1527 for (i = 0; i < nlam; i++)
1529 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1530 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1531 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1532 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1533 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1534 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1539 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1540 t_state *state, t_state *state_local,
1541 rvec **f)
1543 int i, j, nh;
1545 nh = state->nhchainlength;
1547 if (DDMASTER(dd))
1549 for (i = 0; i < efptNR; i++)
1551 state_local->lambda[i] = state->lambda[i];
1553 state_local->fep_state = state->fep_state;
1554 state_local->veta = state->veta;
1555 state_local->vol0 = state->vol0;
1556 copy_mat(state->box, state_local->box);
1557 copy_mat(state->box_rel, state_local->box_rel);
1558 copy_mat(state->boxv, state_local->boxv);
1559 copy_mat(state->svir_prev, state_local->svir_prev);
1560 copy_mat(state->fvir_prev, state_local->fvir_prev);
1561 copy_df_history(&state_local->dfhist, &state->dfhist);
1562 for (i = 0; i < state_local->ngtc; i++)
1564 for (j = 0; j < nh; j++)
1566 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1567 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1569 state_local->therm_integral[i] = state->therm_integral[i];
1571 for (i = 0; i < state_local->nnhpres; i++)
1573 for (j = 0; j < nh; j++)
1575 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1576 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1580 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1581 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1582 dd_bcast(dd, sizeof(real), &state_local->veta);
1583 dd_bcast(dd, sizeof(real), &state_local->vol0);
1584 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1585 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1586 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1587 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1588 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1589 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1590 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1591 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1592 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1593 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1595 /* communicate df_history -- required for restarting from checkpoint */
1596 dd_distribute_dfhist(dd, &state_local->dfhist);
1598 if (dd->nat_home > state_local->nalloc)
1600 dd_realloc_state(state_local, f, dd->nat_home);
1602 for (i = 0; i < estNR; i++)
1604 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1606 switch (i)
1608 case estX:
1609 dd_distribute_vec(dd, cgs, state->x, state_local->x);
1610 break;
1611 case estV:
1612 dd_distribute_vec(dd, cgs, state->v, state_local->v);
1613 break;
1614 case estSDX:
1615 dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1616 break;
1617 case estCGP:
1618 dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1619 break;
1620 case estDISRE_INITF:
1621 case estDISRE_RM3TAV:
1622 case estORIRE_INITF:
1623 case estORIRE_DTAV:
1624 /* Not implemented yet */
1625 break;
1626 default:
1627 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1633 static char dim2char(int dim)
1635 char c = '?';
1637 switch (dim)
1639 case XX: c = 'X'; break;
1640 case YY: c = 'Y'; break;
1641 case ZZ: c = 'Z'; break;
1642 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1645 return c;
1648 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1649 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1651 rvec grid_s[2], *grid_r = NULL, cx, r;
1652 char fname[STRLEN], buf[22];
1653 FILE *out;
1654 int a, i, d, z, y, x;
1655 matrix tric;
1656 real vol;
1658 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1659 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1661 if (DDMASTER(dd))
1663 snew(grid_r, 2*dd->nnodes);
1666 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1668 if (DDMASTER(dd))
1670 for (d = 0; d < DIM; d++)
1672 for (i = 0; i < DIM; i++)
1674 if (d == i)
1676 tric[d][i] = 1;
1678 else
1680 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1682 tric[d][i] = box[i][d]/box[i][i];
1684 else
1686 tric[d][i] = 0;
1691 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1692 out = gmx_fio_fopen(fname, "w");
1693 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1694 a = 1;
1695 for (i = 0; i < dd->nnodes; i++)
1697 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1698 for (d = 0; d < DIM; d++)
1700 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1702 for (z = 0; z < 2; z++)
1704 for (y = 0; y < 2; y++)
1706 for (x = 0; x < 2; x++)
1708 cx[XX] = grid_r[i*2+x][XX];
1709 cx[YY] = grid_r[i*2+y][YY];
1710 cx[ZZ] = grid_r[i*2+z][ZZ];
1711 mvmul(tric, cx, r);
1712 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1713 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1717 for (d = 0; d < DIM; d++)
1719 for (x = 0; x < 4; x++)
1721 switch (d)
1723 case 0: y = 1 + i*8 + 2*x; break;
1724 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1725 case 2: y = 1 + i*8 + x; break;
1727 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1731 gmx_fio_fclose(out);
1732 sfree(grid_r);
1736 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1737 gmx_mtop_t *mtop, t_commrec *cr,
1738 int natoms, rvec x[], matrix box)
1740 char fname[STRLEN], buf[22];
1741 FILE *out;
1742 int i, ii, resnr, c;
1743 char *atomname, *resname;
1744 real b;
1745 gmx_domdec_t *dd;
1747 dd = cr->dd;
1748 if (natoms == -1)
1750 natoms = dd->comm->nat[ddnatVSITE];
1753 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1755 out = gmx_fio_fopen(fname, "w");
1757 fprintf(out, "TITLE %s\n", title);
1758 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1759 for (i = 0; i < natoms; i++)
1761 ii = dd->gatindex[i];
1762 gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
1763 if (i < dd->comm->nat[ddnatZONE])
1765 c = 0;
1766 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1768 c++;
1770 b = c;
1772 else if (i < dd->comm->nat[ddnatVSITE])
1774 b = dd->comm->zones.n;
1776 else
1778 b = dd->comm->zones.n + 1;
1780 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1781 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1783 fprintf(out, "TER\n");
1785 gmx_fio_fclose(out);
1788 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1790 gmx_domdec_comm_t *comm;
1791 int di;
1792 real r;
1794 comm = dd->comm;
1796 r = -1;
1797 if (comm->bInterCGBondeds)
1799 if (comm->cutoff_mbody > 0)
1801 r = comm->cutoff_mbody;
1803 else
1805 /* cutoff_mbody=0 means we do not have DLB */
1806 r = comm->cellsize_min[dd->dim[0]];
1807 for (di = 1; di < dd->ndim; di++)
1809 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1811 if (comm->bBondComm)
1813 r = std::max(r, comm->cutoff_mbody);
1815 else
1817 r = std::min(r, comm->cutoff);
1822 return r;
1825 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1827 real r_mb;
1829 r_mb = dd_cutoff_multibody(dd);
1831 return std::max(dd->comm->cutoff, r_mb);
1835 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
1837 int nc, ntot;
1839 nc = dd->nc[dd->comm->cartpmedim];
1840 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1841 copy_ivec(coord, coord_pme);
1842 coord_pme[dd->comm->cartpmedim] =
1843 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
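/* A worked example of the mapping above (illustration only): with nc = 4
 * PP cells and ntot = 6 ranks along cartpmedim (i.e. 2 PME ranks), the
 * expression reduces to 4 + (coord*2 + 1)/4, so PP cells 0 and 1 map to
 * PME rank coordinate 4 and cells 2 and 3 map to 5.
 */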
1846 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
1848 /* Here we assign a PME node to communicate with this DD node
1849 * by assuming that the major index of both is x.
1850 * We add cr->npmenodes/2 to obtain an even distribution.
1852 return (ddindex*npme + npme/2)/ndd;
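/* For example (illustration only): with ndd = 6 and npme = 2 this gives
 * (ddindex*2 + 1)/6, so DD indices 0-2 are served by PME rank 0 and
 * indices 3-5 by PME rank 1.
 */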
1855 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1857 return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
1860 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
1862 return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
1865 static int *dd_pmenodes(t_commrec *cr)
1867 int *pmenodes;
1868 int n, i, p0, p1;
1870 snew(pmenodes, cr->npmenodes);
1871 n = 0;
1872 for (i = 0; i < cr->dd->nnodes; i++)
1874 p0 = cr_ddindex2pmeindex(cr, i);
1875 p1 = cr_ddindex2pmeindex(cr, i+1);
1876 if (i+1 == cr->dd->nnodes || p1 > p0)
1878 if (debug)
1880 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
1882 pmenodes[n] = i + 1 + n;
1883 n++;
1887 return pmenodes;
1890 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1892 gmx_domdec_t *dd;
1893 ivec coords;
1894 int slab;
1896 dd = cr->dd;
1898 /* if (dd->comm->bCartesian) {
1899 gmx_ddindex2xyz(dd->nc,ddindex,coords);
1900 dd_coords2pmecoords(dd,coords,coords_pme);
1901 copy_ivec(dd->ntot,nc);
1902 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1903 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1905 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1906 } else {
1907 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes; } */
1910 coords[XX] = x;
1911 coords[YY] = y;
1912 coords[ZZ] = z;
1913 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1915 return slab;
1918 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1920 gmx_domdec_comm_t *comm;
1921 ivec coords;
1922 int ddindex, nodeid = -1;
1924 comm = cr->dd->comm;
1926 coords[XX] = x;
1927 coords[YY] = y;
1928 coords[ZZ] = z;
1929 if (comm->bCartesianPP_PME)
1931 #ifdef GMX_MPI
1932 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1933 #endif
1935 else
1937 ddindex = dd_index(cr->dd->nc, coords);
1938 if (comm->bCartesianPP)
1940 nodeid = comm->ddindex2simnodeid[ddindex];
1942 else
1944 if (comm->pmenodes)
1946 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1948 else
1950 nodeid = ddindex;
1955 return nodeid;
1958 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
1960 gmx_domdec_t *dd;
1961 gmx_domdec_comm_t *comm;
1962 int i;
1963 int pmenode = -1;
1965 dd = cr->dd;
1966 comm = dd->comm;
1968 /* This assumes a uniform x domain decomposition grid cell size */
1969 if (comm->bCartesianPP_PME)
1971 #ifdef GMX_MPI
1972 ivec coord, coord_pme;
1973 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1974 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1976 /* This is a PP node */
1977 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1978 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1980 #endif
1982 else if (comm->bCartesianPP)
1984 if (sim_nodeid < dd->nnodes)
1986 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1989 else
1991 /* This assumes DD cells with identical x coordinates
1992 * are numbered sequentially.
1994 if (dd->comm->pmenodes == NULL)
1996 if (sim_nodeid < dd->nnodes)
1998 /* The DD index equals the nodeid */
1999 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2002 else
2004 i = 0;
2005 while (sim_nodeid > dd->comm->pmenodes[i])
2007 i++;
2009 if (sim_nodeid < dd->comm->pmenodes[i])
2011 pmenode = dd->comm->pmenodes[i];
2016 return pmenode;
2019 void get_pme_nnodes(const gmx_domdec_t *dd,
2020 int *npmenodes_x, int *npmenodes_y)
2022 if (dd != NULL)
2024 *npmenodes_x = dd->comm->npmenodes_x;
2025 *npmenodes_y = dd->comm->npmenodes_y;
2027 else
2029 *npmenodes_x = 1;
2030 *npmenodes_y = 1;
2034 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2035 int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2037 gmx_domdec_t *dd;
2038 int x, y, z;
2039 ivec coord, coord_pme;
2041 dd = cr->dd;
2043 snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2045 *nmy_ddnodes = 0;
2046 for (x = 0; x < dd->nc[XX]; x++)
2048 for (y = 0; y < dd->nc[YY]; y++)
2050 for (z = 0; z < dd->nc[ZZ]; z++)
2052 if (dd->comm->bCartesianPP_PME)
2054 coord[XX] = x;
2055 coord[YY] = y;
2056 coord[ZZ] = z;
2057 dd_cart_coord2pmecoord(dd, coord, coord_pme);
2058 if (dd->ci[XX] == coord_pme[XX] &&
2059 dd->ci[YY] == coord_pme[YY] &&
2060 dd->ci[ZZ] == coord_pme[ZZ])
2062 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2065 else
2067 /* The slab corresponds to the nodeid in the PME group */
2068 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2070 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2077 /* The last PP-only node is the peer node */
2078 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2080 if (debug)
2082 fprintf(debug, "Receive coordinates from PP ranks:");
2083 for (x = 0; x < *nmy_ddnodes; x++)
2085 fprintf(debug, " %d", (*my_ddnodes)[x]);
2087 fprintf(debug, "\n");
2091 static gmx_bool receive_vir_ener(t_commrec *cr)
2093 gmx_domdec_comm_t *comm;
2094 int pmenode;
2095 gmx_bool bReceive;
2097 bReceive = TRUE;
2098 if (cr->npmenodes < cr->dd->nnodes)
2100 comm = cr->dd->comm;
2101 if (comm->bCartesianPP_PME)
2103 pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2104 #ifdef GMX_MPI
2105 ivec coords;
2106 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2107 coords[comm->cartpmedim]++;
2108 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2110 int rank;
2111 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2112 if (dd_simnode2pmenode(cr, rank) == pmenode)
2114 /* This is not the last PP node for pmenode */
2115 bReceive = FALSE;
2118 #endif
2120 else
2122 pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2123 if (cr->sim_nodeid+1 < cr->nnodes &&
2124 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2126 /* This is not the last PP node for pmenode */
2127 bReceive = FALSE;
2132 return bReceive;
2135 static void set_zones_ncg_home(gmx_domdec_t *dd)
2137 gmx_domdec_zones_t *zones;
2138 int i;
2140 zones = &dd->comm->zones;
2142 zones->cg_range[0] = 0;
2143 for (i = 1; i < zones->n+1; i++)
2145 zones->cg_range[i] = dd->ncg_home;
2147 /* zone_ncg1[0] should always be equal to ncg_home */
2148 dd->comm->zone_ncg1[0] = dd->ncg_home;
2151 static void rebuild_cgindex(gmx_domdec_t *dd,
2152 const int *gcgs_index, t_state *state)
2154 int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2156 ind = state->cg_gl;
2157 dd_cg_gl = dd->index_gl;
2158 cgindex = dd->cgindex;
2159 nat = 0;
2160 cgindex[0] = nat;
2161 for (i = 0; i < state->ncg_gl; i++)
2163 cgindex[i] = nat;
2164 cg_gl = ind[i];
2165 dd_cg_gl[i] = cg_gl;
2166 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2168 cgindex[i] = nat;
2170 dd->ncg_home = state->ncg_gl;
2171 dd->nat_home = nat;
2173 set_zones_ncg_home(dd);
2176 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2178 while (cg >= cginfo_mb->cg_end)
2180 cginfo_mb++;
2183 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2186 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2187 t_forcerec *fr, char *bLocalCG)
2189 cginfo_mb_t *cginfo_mb;
2190 int *cginfo;
2191 int cg;
2193 if (fr != NULL)
2195 cginfo_mb = fr->cginfo_mb;
2196 cginfo = fr->cginfo;
2198 for (cg = cg0; cg < cg1; cg++)
2200 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2204 if (bLocalCG != NULL)
2206 for (cg = cg0; cg < cg1; cg++)
2208 bLocalCG[index_gl[cg]] = TRUE;
2213 static void make_dd_indices(gmx_domdec_t *dd,
2214 const int *gcgs_index, int cg_start)
2216 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2217 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2218 gmx_bool bCGs;
2220 if (dd->nat_tot > dd->gatindex_nalloc)
2222 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2223 srenew(dd->gatindex, dd->gatindex_nalloc);
2226 nzone = dd->comm->zones.n;
2227 zone2cg = dd->comm->zones.cg_range;
2228 zone_ncg1 = dd->comm->zone_ncg1;
2229 index_gl = dd->index_gl;
2230 gatindex = dd->gatindex;
2231 bCGs = dd->comm->bCGs;
2233 if (zone2cg[1] != dd->ncg_home)
2235 gmx_incons("dd->ncg_zone is not up to date");
2238 /* Make the local to global and global to local atom index */
2239 a = dd->cgindex[cg_start];
2240 for (zone = 0; zone < nzone; zone++)
2242 if (zone == 0)
2244 cg0 = cg_start;
2246 else
2248 cg0 = zone2cg[zone];
2250 cg1 = zone2cg[zone+1];
2251 cg1_p1 = cg0 + zone_ncg1[zone];
2253 for (cg = cg0; cg < cg1; cg++)
2255 zone1 = zone;
2256 if (cg >= cg1_p1)
2258 /* Signal that this cg is from more than one pulse away */
2259 zone1 += nzone;
2261 cg_gl = index_gl[cg];
2262 if (bCGs)
2264 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2266 gatindex[a] = a_gl;
2267 ga2la_set(dd->ga2la, a_gl, a, zone1);
2268 a++;
2271 else
2273 gatindex[a] = cg_gl;
2274 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2275 a++;
2281 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2282 const char *where)
2284 int i, ngl, nerr;
2286 nerr = 0;
2287 if (bLocalCG == NULL)
2289 return nerr;
2291 for (i = 0; i < dd->ncg_tot; i++)
2293 if (!bLocalCG[dd->index_gl[i]])
2295 fprintf(stderr,
2296 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2297 nerr++;
2300 ngl = 0;
2301 for (i = 0; i < ncg_sys; i++)
2303 if (bLocalCG[i])
2305 ngl++;
2308 if (ngl != dd->ncg_tot)
2310 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2311 nerr++;
2314 return nerr;
2317 static void check_index_consistency(gmx_domdec_t *dd,
2318 int natoms_sys, int ncg_sys,
2319 const char *where)
2321 int nerr, ngl, i, a, cell;
2322 int *have;
2324 nerr = 0;
2326 if (dd->comm->DD_debug > 1)
2328 snew(have, natoms_sys);
2329 for (a = 0; a < dd->nat_tot; a++)
2331 if (have[dd->gatindex[a]] > 0)
2333 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2335 else
2337 have[dd->gatindex[a]] = a + 1;
2340 sfree(have);
2343 snew(have, dd->nat_tot);
2345 ngl = 0;
2346 for (i = 0; i < natoms_sys; i++)
2348 if (ga2la_get(dd->ga2la, i, &a, &cell))
2350 if (a >= dd->nat_tot)
2352 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2353 nerr++;
2355 else
2357 have[a] = 1;
2358 if (dd->gatindex[a] != i)
2360 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2361 nerr++;
2364 ngl++;
2367 if (ngl != dd->nat_tot)
2369 fprintf(stderr,
2370 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2371 dd->rank, where, ngl, dd->nat_tot);
2373 for (a = 0; a < dd->nat_tot; a++)
2375 if (have[a] == 0)
2377 fprintf(stderr,
2378 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2379 dd->rank, where, a+1, dd->gatindex[a]+1);
2382 sfree(have);
2384 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2386 if (nerr > 0)
2388 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2389 dd->rank, where, nerr);
2393 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2395 int i;
2396 char *bLocalCG;
2398 if (a_start == 0)
2400 /* Clear the whole list without searching */
2401 ga2la_clear(dd->ga2la);
2403 else
2405 for (i = a_start; i < dd->nat_tot; i++)
2407 ga2la_del(dd->ga2la, dd->gatindex[i]);
2411 bLocalCG = dd->comm->bLocalCG;
2412 if (bLocalCG)
2414 for (i = cg_start; i < dd->ncg_tot; i++)
2416 bLocalCG[dd->index_gl[i]] = FALSE;
2420 dd_clear_local_vsite_indices(dd);
2422 if (dd->constraints)
2424 dd_clear_local_constraint_indices(dd);
2428 /* This function should be used for moving the domain boundaries during DLB,
2429 * for obtaining the minimum cell size. It checks the initially set limit
2430 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2431 * and, possibly, a longer cut-off limit set for PME load balancing.
2433 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2435 real cellsize_min;
2437 cellsize_min = comm->cellsize_min[dim];
2439 if (!comm->bVacDLBNoLimit)
2441 /* The cut-off might have changed, e.g. by PME load balancing,
2442 * from the value used to set comm->cellsize_min, so check it.
2444 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2446 if (comm->bPMELoadBalDLBLimits)
2448 /* Check for the cut-off limit set by the PME load balancing */
2449 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2453 return cellsize_min;
2456 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2457 int dim_ind)
2459 real grid_jump_limit;
2461 /* The distance between the boundaries of cells at distance
2462 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2463 * and by the fact that cells should not be shifted by more than
2464 * half their size, such that cg's only shift by one cell
2465 * at redecomposition.
2467 grid_jump_limit = comm->cellsize_limit;
2468 if (!comm->bVacDLBNoLimit)
2470 if (comm->bPMELoadBalDLBLimits)
2472 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2474 grid_jump_limit = std::max(grid_jump_limit,
2475 cutoff/comm->cd[dim_ind].np);
2478 return grid_jump_limit;
2481 static gmx_bool check_grid_jump(gmx_int64_t step,
2482 gmx_domdec_t *dd,
2483 real cutoff,
2484 gmx_ddbox_t *ddbox,
2485 gmx_bool bFatal)
2487 gmx_domdec_comm_t *comm;
2488 int d, dim;
2489 real limit, bfac;
2490 gmx_bool bInvalid;
2492 bInvalid = FALSE;
2494 comm = dd->comm;
2496 for (d = 1; d < dd->ndim; d++)
2498 dim = dd->dim[d];
2499 limit = grid_jump_limit(comm, cutoff, d);
2500 bfac = ddbox->box_size[dim];
2501 if (ddbox->tric_dir[dim])
2503 bfac *= ddbox->skew_fac[dim];
2505 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2506 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2508 bInvalid = TRUE;
2510 if (bFatal)
2512 char buf[22];
2514 /* This error should never be triggered under normal
2515 * circumstances, but you never know ...
2517 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2518 gmx_step_str(step, buf),
2519 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2524 return bInvalid;
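/* Worked example for check_grid_jump, with illustrative numbers:
 * with box_size[dim] = 6 nm, no triclinic skew and a jump limit of
 * 0.9 nm, a staggering where cell_f1[d] = 0.50 while the neighboring
 * row has cell_f_max0[d] = 0.45 leaves a margin of only
 * (0.50 - 0.45)*6 = 0.3 nm < 0.9 nm, so the grid is flagged as
 * invalid (and, with bFatal set, the run is aborted).
 */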
2527 static int dd_load_count(gmx_domdec_comm_t *comm)
2529 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2532 static float dd_force_load(gmx_domdec_comm_t *comm)
2534 float load;
2536 if (comm->eFlop)
2538 load = comm->flop;
2539 if (comm->eFlop > 1)
2541 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2544 else
2546 load = comm->cycl[ddCyclF];
2547 if (comm->cycl_n[ddCyclF] > 1)
2549 /* Subtract the maximum of the last n cycle counts
2550 * to get rid of possible high counts due to other sources,
2551 * for instance system activity, that would otherwise
2552 * affect the dynamic load balancing.
2554 load -= comm->cycl_max[ddCyclF];
2557 #ifdef GMX_MPI
2558 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2560 float gpu_wait, gpu_wait_sum;
2562 gpu_wait = comm->cycl[ddCyclWaitGPU];
2563 if (comm->cycl_n[ddCyclF] > 1)
2565 /* We should remove the WaitGPU time of the same MD step
2566 * as the one with the maximum F time, since the F time
2567 * and the wait time are not independent.
2568 * Furthermore, the step for the max F time should be chosen
2569 * the same on all ranks that share the same GPU.
2570 * But to keep the code simple, we remove the average instead.
2571 * The main reason for artificially long times at some steps
2572 * is spurious CPU activity or MPI time, so we don't expect
2573 * that changes in the GPU wait time matter a lot here.
2575 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2577 /* Sum the wait times over the ranks that share the same GPU */
2578 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2579 comm->mpi_comm_gpu_shared);
2580 /* Replace the wait time by the average over the ranks */
2581 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2583 #endif
2586 return load;
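/* Worked example for the GPU wait averaging above, with illustrative
 * numbers: if three PP ranks share one GPU with wait contributions of
 * 10, 20 and 30 cycle units (after the (n-1)/n correction above), the
 * MPI_Allreduce sum is 60 and each rank replaces its own wait time by
 * the average 60/3 = 20, i.e. load += -gpu_wait + 20, so all sharing
 * ranks see the same wait contribution in their load.
 */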
2589 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2591 gmx_domdec_comm_t *comm;
2592 int i;
2594 comm = dd->comm;
2596 snew(*dim_f, dd->nc[dim]+1);
2597 (*dim_f)[0] = 0;
2598 for (i = 1; i < dd->nc[dim]; i++)
2600 if (comm->slb_frac[dim])
2602 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2604 else
2606 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2609 (*dim_f)[dd->nc[dim]] = 1;
2612 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2614 int pmeindex, slab, nso, i;
2615 ivec xyz;
2617 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2619 ddpme->dim = YY;
2621 else
2623 ddpme->dim = dimind;
2625 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2627 ddpme->nslab = (ddpme->dim == 0 ?
2628 dd->comm->npmenodes_x :
2629 dd->comm->npmenodes_y);
2631 if (ddpme->nslab <= 1)
2633 return;
2636 nso = dd->comm->npmenodes/ddpme->nslab;
2637 /* Determine for each PME slab the PP location range for dimension dim */
2638 snew(ddpme->pp_min, ddpme->nslab);
2639 snew(ddpme->pp_max, ddpme->nslab);
2640 for (slab = 0; slab < ddpme->nslab; slab++)
2642 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2643 ddpme->pp_max[slab] = 0;
2645 for (i = 0; i < dd->nnodes; i++)
2647 ddindex2xyz(dd->nc, i, xyz);
2648 /* For y only use our y/z slab.
2649 * This assumes that the PME x grid size matches the DD grid size.
2651 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2653 pmeindex = ddindex2pmeindex(dd, i);
2654 if (dimind == 0)
2656 slab = pmeindex/nso;
2658 else
2660 slab = pmeindex % ddpme->nslab;
2662 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2663 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2667 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2670 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2672 if (dd->comm->ddpme[0].dim == XX)
2674 return dd->comm->ddpme[0].maxshift;
2676 else
2678 return 0;
2682 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2684 if (dd->comm->ddpme[0].dim == YY)
2686 return dd->comm->ddpme[0].maxshift;
2688 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2690 return dd->comm->ddpme[1].maxshift;
2692 else
2694 return 0;
2698 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2699 gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2701 gmx_domdec_comm_t *comm;
2702 int nc, ns, s;
2703 int *xmin, *xmax;
2704 real range, pme_boundary;
2705 int sh;
2707 comm = dd->comm;
2708 nc = dd->nc[ddpme->dim];
2709 ns = ddpme->nslab;
2711 if (!ddpme->dim_match)
2713 /* PP decomposition is not along dim: the worst situation */
2714 sh = ns/2;
2716 else if (ns <= 3 || (bUniform && ns == nc))
2718 /* The optimal situation */
2719 sh = 1;
2721 else
2723 /* We need to check, for all PME nodes, which nodes they
2724 * could possibly need to communicate with.
2726 xmin = ddpme->pp_min;
2727 xmax = ddpme->pp_max;
2728 /* Allow for atoms to be maximally 2/3 times the cut-off
2729 * out of their DD cell. This is a reasonable balance
2730 * between performance and support for most charge-group/cut-off
2731 * combinations.
2733 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2734 /* Avoid extra communication when we are exactly at a boundary */
2735 range *= 0.999;
2737 sh = 1;
2738 for (s = 0; s < ns; s++)
2740 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2741 pme_boundary = (real)s/ns;
2742 while (sh+1 < ns &&
2743 ((s-(sh+1) >= 0 &&
2744 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2745 (s-(sh+1) < 0 &&
2746 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2748 sh++;
2750 pme_boundary = (real)(s+1)/ns;
2751 while (sh+1 < ns &&
2752 ((s+(sh+1) < ns &&
2753 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2754 (s+(sh+1) >= ns &&
2755 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2757 sh++;
2762 ddpme->maxshift = sh;
2764 if (debug)
2766 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2767 ddpme->dim, ddpme->maxshift);
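/* Worked example for the maxshift determination above, with illustrative
 * numbers: with comm->cutoff = 1.2 nm and a 12 nm box along ddpme->dim,
 * atoms may stray 2/3*1.2 = 0.8 nm out of their cell, i.e. a box fraction
 * of roughly 0.067. If there are ns = 10 PME slabs and the PP cells along
 * this dimension all have widths of about 0.1 box fraction, an atom can
 * never reach beyond the directly neighboring slab, so the while loops
 * never trigger and maxshift stays at sh = 1.
 */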
2771 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2773 int d, dim;
2775 for (d = 0; d < dd->ndim; d++)
2777 dim = dd->dim[d];
2778 if (dim < ddbox->nboundeddim &&
2779 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2780 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2782 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2783 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2784 dd->nc[dim], dd->comm->cellsize_limit);
2789 enum {
2790 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2793 /* Set the domain boundaries. Use for static (or no) load balancing,
2794 * and also for the starting state for dynamic load balancing.
2795 * setmode determines if and where the boundaries are stored; use the enum above.
2796 * Returns the number of communication pulses in npulse.
2798 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
2799 int setmode, ivec npulse)
2801 gmx_domdec_comm_t *comm;
2802 int d, j;
2803 rvec cellsize_min;
2804 real *cell_x, cell_dx, cellsize;
2806 comm = dd->comm;
2808 for (d = 0; d < DIM; d++)
2810 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2811 npulse[d] = 1;
2812 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2814 /* Uniform grid */
2815 cell_dx = ddbox->box_size[d]/dd->nc[d];
2816 switch (setmode)
2818 case setcellsizeslbMASTER:
2819 for (j = 0; j < dd->nc[d]+1; j++)
2821 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2823 break;
2824 case setcellsizeslbLOCAL:
2825 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2826 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2827 break;
2828 default:
2829 break;
2831 cellsize = cell_dx*ddbox->skew_fac[d];
2832 while (cellsize*npulse[d] < comm->cutoff)
2834 npulse[d]++;
2836 cellsize_min[d] = cellsize;
2838 else
2840 /* Statically load balanced grid */
2841 /* Even when we are not doing a master distribution, we determine
2842 * all cell borders in a loop to obtain values identical
2843 * to the master distribution case and to determine npulse.
2845 if (setmode == setcellsizeslbMASTER)
2847 cell_x = dd->ma->cell_x[d];
2849 else
2851 snew(cell_x, dd->nc[d]+1);
2853 cell_x[0] = ddbox->box0[d];
2854 for (j = 0; j < dd->nc[d]; j++)
2856 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2857 cell_x[j+1] = cell_x[j] + cell_dx;
2858 cellsize = cell_dx*ddbox->skew_fac[d];
2859 while (cellsize*npulse[d] < comm->cutoff &&
2860 npulse[d] < dd->nc[d]-1)
2862 npulse[d]++;
2864 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2866 if (setmode == setcellsizeslbLOCAL)
2868 comm->cell_x0[d] = cell_x[dd->ci[d]];
2869 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2871 if (setmode != setcellsizeslbMASTER)
2873 sfree(cell_x);
2876 /* The following limitation is there to avoid a cell receiving
2877 * some of its own home charge groups back over the periodic boundary.
2878 * Duplicate charge groups cause trouble with the global indices.
2880 if (d < ddbox->npbcdim &&
2881 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2883 char error_string[STRLEN];
2885 sprintf(error_string,
2886 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2887 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2888 comm->cutoff,
2889 dd->nc[d], dd->nc[d],
2890 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2892 if (setmode == setcellsizeslbLOCAL)
2894 gmx_fatal_collective(FARGS, NULL, dd, error_string);
2896 else
2898 gmx_fatal(FARGS, error_string);
2903 if (!dlbIsOn(comm))
2905 copy_rvec(cellsize_min, comm->cellsize_min);
2908 for (d = 0; d < comm->npmedecompdim; d++)
2910 set_pme_maxshift(dd, &comm->ddpme[d],
2911 comm->slb_frac[dd->dim[d]] == NULL, ddbox,
2912 comm->ddpme[d].slb_dim_f);
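/* Worked example for npulse above: npulse[d] is the smallest number of
 * communication pulses with cellsize*npulse >= cutoff. With a (skewed)
 * cell size of 0.9 nm and a cut-off of 1.4 nm, one pulse covers only
 * 0.9 nm, so npulse[d] becomes 2 (since 2*0.9 = 1.8 >= 1.4).
 */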
2917 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2918 int d, int dim, domdec_root_t *root,
2919 gmx_ddbox_t *ddbox,
2920 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2922 gmx_domdec_comm_t *comm;
2923 int ncd, i, j, nmin, nmin_old;
2924 gmx_bool bLimLo, bLimHi;
2925 real *cell_size;
2926 real fac, halfway, cellsize_limit_f_i, region_size;
2927 gmx_bool bPBC, bLastHi = FALSE;
2928 int nrange[] = {range[0], range[1]};
2930 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2932 comm = dd->comm;
2934 ncd = dd->nc[dim];
2936 bPBC = (dim < ddbox->npbcdim);
2938 cell_size = root->buf_ncd;
2940 if (debug)
2942 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2945 /* First we need to check that the scaling does not make cells
2946 * smaller than the smallest allowed size.
2947 * We need to do this iteratively, since if a cell is too small,
2948 * it needs to be enlarged, which makes all the other cells smaller,
2949 * which could in turn make another cell smaller than allowed.
2951 for (i = range[0]; i < range[1]; i++)
2953 root->bCellMin[i] = FALSE;
2955 nmin = 0;
2958 nmin_old = nmin;
2959 /* We need the total for normalization */
2960 fac = 0;
2961 for (i = range[0]; i < range[1]; i++)
2963 if (root->bCellMin[i] == FALSE)
2965 fac += cell_size[i];
2968 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2969 /* Determine the cell boundaries */
2970 for (i = range[0]; i < range[1]; i++)
2972 if (root->bCellMin[i] == FALSE)
2974 cell_size[i] *= fac;
2975 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2977 cellsize_limit_f_i = 0;
2979 else
2981 cellsize_limit_f_i = cellsize_limit_f;
2983 if (cell_size[i] < cellsize_limit_f_i)
2985 root->bCellMin[i] = TRUE;
2986 cell_size[i] = cellsize_limit_f_i;
2987 nmin++;
2990 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2993 while (nmin > nmin_old);
2995 i = range[1]-1;
2996 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2997 /* For this check we should not use DD_CELL_MARGIN,
2998 * but a slightly smaller factor,
2999 * since rounding could get us below the limit.
3001 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3003 char buf[22];
3004 gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3005 gmx_step_str(step, buf),
3006 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3007 ncd, comm->cellsize_min[dim]);
3010 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3012 if (!bUniform)
3014 /* Check if the boundary did not displace more than halfway
3015 * each of the cells it bounds, as this could cause problems,
3016 * especially when the differences between cell sizes are large.
3017 * If changes are applied, they will not make cells smaller
3018 * than the cut-off, as we check all the boundaries which
3019 * might be affected by a change and if the old state was ok,
3020 * the cells will at most be shrunk back to their old size.
3022 for (i = range[0]+1; i < range[1]; i++)
3024 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3025 if (root->cell_f[i] < halfway)
3027 root->cell_f[i] = halfway;
3028 /* Check if the change also causes shifts of the next boundaries */
3029 for (j = i+1; j < range[1]; j++)
3031 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3033 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3037 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3038 if (root->cell_f[i] > halfway)
3040 root->cell_f[i] = halfway;
3041 /* Check if the change also causes shifts of the next boundaries */
3042 for (j = i-1; j >= range[0]+1; j--)
3044 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3046 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3053 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3054 /* Find the highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following one) (b),
3055 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3056 * For a and b, nrange is used. */
3057 if (d > 0)
3059 /* Take care of the staggering of the cell boundaries */
3060 if (bUniform)
3062 for (i = range[0]; i < range[1]; i++)
3064 root->cell_f_max0[i] = root->cell_f[i];
3065 root->cell_f_min1[i] = root->cell_f[i+1];
3068 else
3070 for (i = range[0]+1; i < range[1]; i++)
3072 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3073 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3074 if (bLimLo && bLimHi)
3076 /* Both limits violated, try the best we can */
3077 /* For this case we split the original range (range) into two parts and take care of the other limitations in the next iteration. */
3078 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3079 nrange[0] = range[0];
3080 nrange[1] = i;
3081 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3083 nrange[0] = i;
3084 nrange[1] = range[1];
3085 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3087 return;
3089 else if (bLimLo)
3091 /* root->cell_f[i] = root->bound_min[i]; */
3092 nrange[1] = i; /* Only store the violation location. There could be a following LimLo violation with a higher index. */
3093 bLastHi = FALSE;
3095 else if (bLimHi && !bLastHi)
3097 bLastHi = TRUE;
3098 if (nrange[1] < range[1]) /* found a LimLo before */
3100 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3101 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3102 nrange[0] = nrange[1];
3104 root->cell_f[i] = root->bound_max[i];
3105 nrange[1] = i;
3106 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3107 nrange[0] = i;
3108 nrange[1] = range[1];
3111 if (nrange[1] < range[1]) /* the last violation found was a LimLo */
3113 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3114 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3115 nrange[0] = nrange[1];
3116 nrange[1] = range[1];
3117 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3119 else if (nrange[0] > range[0]) /* found at least one LimHi */
3121 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
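/* Worked example for the iterative minimum-size enforcement at the start of
 * dd_cell_sizes_dlb_root_enforce_limits, with illustrative numbers: suppose
 * four cells span a relative region of size 1 with
 * cell_size = {0.4, 0.3, 0.2, 0.1} and cellsize_limit_f = 0.15. In the
 * first pass fac = 1 and the last cell is clamped to 0.15 (nmin = 1).
 * In the second pass the remaining 1 - 0.15 = 0.85 is redistributed over
 * the other cells with fac = 0.85/0.9, giving approximately
 * {0.378, 0.283, 0.189, 0.15}; all cells now respect the limit and the
 * loop terminates.
 */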
3128 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3129 int d, int dim, domdec_root_t *root,
3130 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3131 gmx_bool bUniform, gmx_int64_t step)
3133 gmx_domdec_comm_t *comm;
3134 int ncd, d1, i, pos;
3135 real *cell_size;
3136 real load_aver, load_i, imbalance, change, change_max, sc;
3137 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3138 real change_limit;
3139 real relax = 0.5;
3140 gmx_bool bPBC;
3141 int range[] = { 0, 0 };
3143 comm = dd->comm;
3145 /* Convert the maximum change from the input percentage to a fraction */
3146 change_limit = comm->dlb_scale_lim*0.01;
3148 ncd = dd->nc[dim];
3150 bPBC = (dim < ddbox->npbcdim);
3152 cell_size = root->buf_ncd;
3154 /* Store the original boundaries */
3155 for (i = 0; i < ncd+1; i++)
3157 root->old_cell_f[i] = root->cell_f[i];
3159 if (bUniform)
3161 for (i = 0; i < ncd; i++)
3163 cell_size[i] = 1.0/ncd;
3166 else if (dd_load_count(comm) > 0)
3168 load_aver = comm->load[d].sum_m/ncd;
3169 change_max = 0;
3170 for (i = 0; i < ncd; i++)
3172 /* Determine the relative imbalance of cell i */
3173 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3174 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3175 /* Determine the change of the cell size using underrelaxation */
3176 change = -relax*imbalance;
3177 change_max = std::max(change_max, std::max(change, -change));
3179 /* Limit the amount of scaling.
3180 * We need to use the same rescaling for all cells in one row,
3181 * otherwise the load balancing might not converge.
3183 sc = relax;
3184 if (change_max > change_limit)
3186 sc *= change_limit/change_max;
3188 for (i = 0; i < ncd; i++)
3190 /* Determine the relative imbalance of cell i */
3191 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3192 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3193 /* Determine the change of the cell size using underrelaxation */
3194 change = -sc*imbalance;
3195 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3199 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3200 cellsize_limit_f *= DD_CELL_MARGIN;
3201 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3202 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3203 if (ddbox->tric_dir[dim])
3205 cellsize_limit_f /= ddbox->skew_fac[dim];
3206 dist_min_f /= ddbox->skew_fac[dim];
3208 if (bDynamicBox && d > 0)
3210 dist_min_f *= DD_PRES_SCALE_MARGIN;
3212 if (d > 0 && !bUniform)
3214 /* Make sure that the grid is not shifted too much */
3215 for (i = 1; i < ncd; i++)
3217 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3219 gmx_incons("Inconsistent DD boundary staggering limits!");
3221 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3222 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3223 if (space > 0)
3225 root->bound_min[i] += 0.5*space;
3227 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3228 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3229 if (space < 0)
3231 root->bound_max[i] += 0.5*space;
3233 if (debug)
3235 fprintf(debug,
3236 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3237 d, i,
3238 root->cell_f_max0[i-1] + dist_min_f,
3239 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3240 root->cell_f_min1[i] - dist_min_f);
3244 range[1] = ncd;
3245 root->cell_f[0] = 0;
3246 root->cell_f[ncd] = 1;
3247 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3250 /* After the checks above, the cells should obey the cut-off
3251 * restrictions, but it does not hurt to check.
3253 for (i = 0; i < ncd; i++)
3255 if (debug)
3257 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3258 dim, i, root->cell_f[i], root->cell_f[i+1]);
3261 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3262 root->cell_f[i+1] - root->cell_f[i] <
3263 cellsize_limit_f/DD_CELL_MARGIN)
3265 char buf[22];
3266 fprintf(stderr,
3267 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3268 gmx_step_str(step, buf), dim2char(dim), i,
3269 (root->cell_f[i+1] - root->cell_f[i])
3270 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3274 pos = ncd + 1;
3275 /* Store the cell boundaries of the lower dimensions at the end */
3276 for (d1 = 0; d1 < d; d1++)
3278 root->cell_f[pos++] = comm->cell_f0[d1];
3279 root->cell_f[pos++] = comm->cell_f1[d1];
3282 if (d < comm->npmedecompdim)
3284 /* The master determines the maximum shift for
3285 * the coordinate communication between separate PME nodes.
3287 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3289 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3290 if (d >= 1)
3292 root->cell_f[pos++] = comm->ddpme[1].maxshift;
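/* Worked example for the under-relaxed rescaling above, with illustrative
 * numbers: two cells in a row with loads 120 and 80 (average 100) have
 * imbalances of +0.2 and -0.2. With relax = 0.5, and assuming the maximum
 * change does not exceed the DLB scaling limit, the changes are -0.1 and
 * +0.1, so two equal cells of relative size 0.5 become 0.45 and 0.55:
 * the overloaded cell shrinks, the underloaded one grows, and the total
 * stays 1.
 */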
3296 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3297 gmx_ddbox_t *ddbox, int dimind)
3299 gmx_domdec_comm_t *comm;
3300 int dim;
3302 comm = dd->comm;
3304 /* Set the cell dimensions */
3305 dim = dd->dim[dimind];
3306 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3307 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3308 if (dim >= ddbox->nboundeddim)
3310 comm->cell_x0[dim] += ddbox->box0[dim];
3311 comm->cell_x1[dim] += ddbox->box0[dim];
3315 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3316 int d, int dim, real *cell_f_row,
3317 gmx_ddbox_t *ddbox)
3319 gmx_domdec_comm_t *comm;
3320 int d1, pos;
3322 comm = dd->comm;
3324 #ifdef GMX_MPI
3325 /* Each node would only need to know two fractions,
3326 * but it is probably cheaper to broadcast the whole array.
3328 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3329 0, comm->mpi_comm_load[d]);
3330 #endif
3331 /* Copy the fractions for this dimension from the buffer */
3332 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3333 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3334 /* The whole array was communicated, so set the buffer position */
3335 pos = dd->nc[dim] + 1;
3336 for (d1 = 0; d1 <= d; d1++)
3338 if (d1 < d)
3340 /* Copy the cell fractions of the lower dimensions */
3341 comm->cell_f0[d1] = cell_f_row[pos++];
3342 comm->cell_f1[d1] = cell_f_row[pos++];
3344 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3346 /* Convert the communicated shift from float to int */
3347 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3348 if (d >= 1)
3350 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
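/* Layout of cell_f_row as read above, shown for an illustrative case with
 * d = 1 and dd->nc[dim] = 4: entries 0..4 hold the 5 boundary fractions of
 * this dimension, entries 5 and 6 hold cell_f0/cell_f1 of the lower
 * dimension 0, entry 7 holds ddpme[0].maxshift and entry 8 holds
 * ddpme[1].maxshift (the last two are communicated as reals and rounded
 * back to int here).
 */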
3354 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3355 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3356 gmx_bool bUniform, gmx_int64_t step)
3358 gmx_domdec_comm_t *comm;
3359 int d, dim, d1;
3360 gmx_bool bRowMember, bRowRoot;
3361 real *cell_f_row;
3363 comm = dd->comm;
3365 for (d = 0; d < dd->ndim; d++)
3367 dim = dd->dim[d];
3368 bRowMember = TRUE;
3369 bRowRoot = TRUE;
3370 for (d1 = d; d1 < dd->ndim; d1++)
3372 if (dd->ci[dd->dim[d1]] > 0)
3374 if (d1 != d)
3376 bRowMember = FALSE;
3378 bRowRoot = FALSE;
3381 if (bRowMember)
3383 if (bRowRoot)
3385 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3386 ddbox, bDynamicBox, bUniform, step);
3387 cell_f_row = comm->root[d]->cell_f;
3389 else
3391 cell_f_row = comm->cell_f_row;
3393 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3398 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3400 int d;
3402 /* This function assumes the box is static and should therefore
3403 * not be called when the box has changed since the last
3404 * call to dd_partition_system.
3406 for (d = 0; d < dd->ndim; d++)
3408 relative_to_absolute_cell_bounds(dd, ddbox, d);
3414 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3415 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3416 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3417 gmx_wallcycle_t wcycle)
3419 gmx_domdec_comm_t *comm;
3420 int dim;
3422 comm = dd->comm;
3424 if (bDoDLB)
3426 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3427 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3428 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3430 else if (bDynamicBox)
3432 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3435 /* Set the dimensions for which no DD is used */
3436 for (dim = 0; dim < DIM; dim++)
3438 if (dd->nc[dim] == 1)
3440 comm->cell_x0[dim] = 0;
3441 comm->cell_x1[dim] = ddbox->box_size[dim];
3442 if (dim >= ddbox->nboundeddim)
3444 comm->cell_x0[dim] += ddbox->box0[dim];
3445 comm->cell_x1[dim] += ddbox->box0[dim];
3451 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3453 int d, np, i;
3454 gmx_domdec_comm_dim_t *cd;
3456 for (d = 0; d < dd->ndim; d++)
3458 cd = &dd->comm->cd[d];
3459 np = npulse[dd->dim[d]];
3460 if (np > cd->np_nalloc)
3462 if (debug)
3464 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3465 dim2char(dd->dim[d]), np);
3467 if (DDMASTER(dd) && cd->np_nalloc > 0)
3469 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3471 srenew(cd->ind, np);
3472 for (i = cd->np_nalloc; i < np; i++)
3474 cd->ind[i].index = NULL;
3475 cd->ind[i].nalloc = 0;
3477 cd->np_nalloc = np;
3479 cd->np = np;
3484 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3485 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3486 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3487 gmx_wallcycle_t wcycle)
3489 gmx_domdec_comm_t *comm;
3490 int d;
3491 ivec npulse;
3493 comm = dd->comm;
3495 /* Copy the old cell boundaries for the cg displacement check */
3496 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3497 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3499 if (dlbIsOn(comm))
3501 if (DDMASTER(dd))
3503 check_box_size(dd, ddbox);
3505 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3507 else
3509 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3510 realloc_comm_ind(dd, npulse);
3513 if (debug)
3515 for (d = 0; d < DIM; d++)
3517 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3518 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3523 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3524 gmx_ddbox_t *ddbox,
3525 rvec cell_ns_x0, rvec cell_ns_x1,
3526 gmx_int64_t step)
3528 gmx_domdec_comm_t *comm;
3529 int dim_ind, dim;
3531 comm = dd->comm;
3533 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3535 dim = dd->dim[dim_ind];
3537 /* Without PBC we don't have restrictions on the outer cells */
3538 if (!(dim >= ddbox->npbcdim &&
3539 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3540 dlbIsOn(comm) &&
3541 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3542 comm->cellsize_min[dim])
3544 char buf[22];
3545 gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3546 gmx_step_str(step, buf), dim2char(dim),
3547 comm->cell_x1[dim] - comm->cell_x0[dim],
3548 ddbox->skew_fac[dim],
3549 dd->comm->cellsize_min[dim],
3550 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3554 if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3556 /* Communicate the boundaries and update cell_ns_x0/1 */
3557 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3558 if (dlbIsOn(dd->comm) && dd->ndim > 1)
3560 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3565 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3567 if (YY < npbcdim)
3569 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3571 else
3573 tcm[YY][XX] = 0;
3575 if (ZZ < npbcdim)
3577 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3578 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3580 else
3582 tcm[ZZ][XX] = 0;
3583 tcm[ZZ][YY] = 0;
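/* Worked example for the triclinic correction matrix, with illustrative
 * numbers: for a box whose y vector is box[YY] = (3, 6, 0), i.e. skewed by
 * 3 along x, tcm[YY][XX] = -3/6 = -0.5. A point at (x, y) then gets the
 * triclinic x-coordinate x + y*tcm[YY][XX] = x - 0.5*y, which removes the
 * shear so that the cell assignment along x can use rectangular boundaries.
 */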
3587 static void check_screw_box(matrix box)
3589 /* Mathematical limitation */
3590 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3592 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3595 /* Limitation due to the asymmetry of the eighth shell method */
3596 if (box[ZZ][YY] != 0)
3598 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3602 static void distribute_cg(FILE *fplog,
3603 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3604 gmx_domdec_t *dd)
3606 gmx_domdec_master_t *ma;
3607 int **tmp_ind = NULL, *tmp_nalloc = NULL;
3608 int i, icg, j, k, k0, k1, d;
3609 matrix tcm;
3610 rvec cg_cm;
3611 ivec ind;
3612 real nrcg, inv_ncg, pos_d;
3613 int *cgindex;
3614 gmx_bool bScrew;
3616 ma = dd->ma;
3618 if (tmp_ind == NULL)
3620 snew(tmp_nalloc, dd->nnodes);
3621 snew(tmp_ind, dd->nnodes);
3622 for (i = 0; i < dd->nnodes; i++)
3624 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3625 snew(tmp_ind[i], tmp_nalloc[i]);
3629 /* Clear the count */
3630 for (i = 0; i < dd->nnodes; i++)
3632 ma->ncg[i] = 0;
3633 ma->nat[i] = 0;
3636 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3638 cgindex = cgs->index;
3640 /* Compute the center of geometry for all charge groups */
3641 for (icg = 0; icg < cgs->nr; icg++)
3643 k0 = cgindex[icg];
3644 k1 = cgindex[icg+1];
3645 nrcg = k1 - k0;
3646 if (nrcg == 1)
3648 copy_rvec(pos[k0], cg_cm);
3650 else
3652 inv_ncg = 1.0/nrcg;
3654 clear_rvec(cg_cm);
3655 for (k = k0; (k < k1); k++)
3657 rvec_inc(cg_cm, pos[k]);
3659 for (d = 0; (d < DIM); d++)
3661 cg_cm[d] *= inv_ncg;
3664 /* Put the charge group in the box and determine the cell index */
3665 for (d = DIM-1; d >= 0; d--)
3667 pos_d = cg_cm[d];
3668 if (d < dd->npbcdim)
3670 bScrew = (dd->bScrewPBC && d == XX);
3671 if (tric_dir[d] && dd->nc[d] > 1)
3673 /* Use triclinic coordinates for this dimension */
3674 for (j = d+1; j < DIM; j++)
3676 pos_d += cg_cm[j]*tcm[j][d];
3679 while (pos_d >= box[d][d])
3681 pos_d -= box[d][d];
3682 rvec_dec(cg_cm, box[d]);
3683 if (bScrew)
3685 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3686 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3688 for (k = k0; (k < k1); k++)
3690 rvec_dec(pos[k], box[d]);
3691 if (bScrew)
3693 pos[k][YY] = box[YY][YY] - pos[k][YY];
3694 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3698 while (pos_d < 0)
3700 pos_d += box[d][d];
3701 rvec_inc(cg_cm, box[d]);
3702 if (bScrew)
3704 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3705 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3707 for (k = k0; (k < k1); k++)
3709 rvec_inc(pos[k], box[d]);
3710 if (bScrew)
3712 pos[k][YY] = box[YY][YY] - pos[k][YY];
3713 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3718 /* This could be done more efficiently */
3719 ind[d] = 0;
3720 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3722 ind[d]++;
3725 i = dd_index(dd->nc, ind);
3726 if (ma->ncg[i] == tmp_nalloc[i])
3728 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3729 srenew(tmp_ind[i], tmp_nalloc[i]);
3731 tmp_ind[i][ma->ncg[i]] = icg;
3732 ma->ncg[i]++;
3733 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3736 k1 = 0;
3737 for (i = 0; i < dd->nnodes; i++)
3739 ma->index[i] = k1;
3740 for (k = 0; k < ma->ncg[i]; k++)
3742 ma->cg[k1++] = tmp_ind[i][k];
3745 ma->index[dd->nnodes] = k1;
3747 for (i = 0; i < dd->nnodes; i++)
3749 sfree(tmp_ind[i]);
3751 sfree(tmp_ind);
3752 sfree(tmp_nalloc);
3754 if (fplog)
3756 /* Here we avoid int overflows due to #atoms^2: use double, dsqr */
3757 int nat_sum, nat_min, nat_max;
3758 double nat2_sum;
3760 nat_sum = 0;
3761 nat2_sum = 0;
3762 nat_min = ma->nat[0];
3763 nat_max = ma->nat[0];
3764 for (i = 0; i < dd->nnodes; i++)
3766 nat_sum += ma->nat[i];
3767 nat2_sum += dsqr(ma->nat[i]);
3768 nat_min = std::min(nat_min, ma->nat[i]);
3769 nat_max = std::max(nat_max, ma->nat[i]);
3771 nat_sum /= dd->nnodes;
3772 nat2_sum /= dd->nnodes;
3774 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3775 dd->nnodes,
3776 nat_sum,
3777 static_cast<int>(sqrt(nat2_sum - dsqr(nat_sum) + 0.5)),
3778 nat_min, nat_max);
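/* Worked example for the statistics above: with two domains holding 90 and
 * 110 atoms, nat_sum/nnodes = 100 and nat2_sum/nnodes = (8100 + 12100)/2 =
 * 10100, so the printed standard deviation is sqrt(10100 - 10000),
 * i.e. about 10 atoms.
 */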
3782 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3783 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3784 rvec pos[])
3786 gmx_domdec_master_t *ma = NULL;
3787 ivec npulse;
3788 int i, cg_gl;
3789 int *ibuf, buf2[2] = { 0, 0 };
3790 gmx_bool bMaster = DDMASTER(dd);
3792 if (bMaster)
3794 ma = dd->ma;
3796 if (dd->bScrewPBC)
3798 check_screw_box(box);
3801 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3803 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3804 for (i = 0; i < dd->nnodes; i++)
3806 ma->ibuf[2*i] = ma->ncg[i];
3807 ma->ibuf[2*i+1] = ma->nat[i];
3809 ibuf = ma->ibuf;
3811 else
3813 ibuf = NULL;
3815 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3817 dd->ncg_home = buf2[0];
3818 dd->nat_home = buf2[1];
3819 dd->ncg_tot = dd->ncg_home;
3820 dd->nat_tot = dd->nat_home;
3821 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3823 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3824 srenew(dd->index_gl, dd->cg_nalloc);
3825 srenew(dd->cgindex, dd->cg_nalloc+1);
3827 if (bMaster)
3829 for (i = 0; i < dd->nnodes; i++)
3831 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3832 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3836 dd_scatterv(dd,
3837 bMaster ? ma->ibuf : NULL,
3838 bMaster ? ma->ibuf+dd->nnodes : NULL,
3839 bMaster ? ma->cg : NULL,
3840 dd->ncg_home*sizeof(int), dd->index_gl);
3842 /* Determine the home charge group sizes */
3843 dd->cgindex[0] = 0;
3844 for (i = 0; i < dd->ncg_home; i++)
3846 cg_gl = dd->index_gl[i];
3847 dd->cgindex[i+1] =
3848 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3851 if (debug)
3853 fprintf(debug, "Home charge groups:\n");
3854 for (i = 0; i < dd->ncg_home; i++)
3856 fprintf(debug, " %d", dd->index_gl[i]);
3857 if (i % 10 == 9)
3859 fprintf(debug, "\n");
3862 fprintf(debug, "\n");
3866 static int compact_and_copy_vec_at(int ncg, int *move,
3867 int *cgindex,
3868 int nvec, int vec,
3869 rvec *src, gmx_domdec_comm_t *comm,
3870 gmx_bool bCompact)
3872 int m, icg, i, i0, i1, nrcg;
3873 int home_pos;
3874 int pos_vec[DIM*2];
3876 home_pos = 0;
3878 for (m = 0; m < DIM*2; m++)
3880 pos_vec[m] = 0;
3883 i0 = 0;
3884 for (icg = 0; icg < ncg; icg++)
3886 i1 = cgindex[icg+1];
3887 m = move[icg];
3888 if (m == -1)
3890 if (bCompact)
3892 /* Compact the home array in place */
3893 for (i = i0; i < i1; i++)
3895 copy_rvec(src[i], src[home_pos++]);
3899 else
3901 /* Copy to the communication buffer */
3902 nrcg = i1 - i0;
3903 pos_vec[m] += 1 + vec*nrcg;
3904 for (i = i0; i < i1; i++)
3906 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3908 pos_vec[m] += (nvec - vec - 1)*nrcg;
3910 if (!bCompact)
3912 home_pos += i1 - i0;
3914 i0 = i1;
3917 return home_pos;
3920 static int compact_and_copy_vec_cg(int ncg, int *move,
3921 int *cgindex,
3922 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3923 gmx_bool bCompact)
3925 int m, icg, i0, i1, nrcg;
3926 int home_pos;
3927 int pos_vec[DIM*2];
3929 home_pos = 0;
3931 for (m = 0; m < DIM*2; m++)
3933 pos_vec[m] = 0;
3936 i0 = 0;
3937 for (icg = 0; icg < ncg; icg++)
3939 i1 = cgindex[icg+1];
3940 m = move[icg];
3941 if (m == -1)
3943 if (bCompact)
3945 /* Compact the home array in place */
3946 copy_rvec(src[icg], src[home_pos++]);
3949 else
3951 nrcg = i1 - i0;
3952 /* Copy to the communication buffer */
3953 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3954 pos_vec[m] += 1 + nrcg*nvec;
3956 i0 = i1;
3958 if (!bCompact)
3960 home_pos = ncg;
3963 return home_pos;
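/* Layout of comm->cgcm_state[m] as filled by the two routines above, for
 * one moved charge group with nrcg atoms and nvec state vectors (e.g. x
 * and v):
 *   [cg_cm][x_0 .. x_{nrcg-1}][v_0 .. v_{nrcg-1}]
 * compact_and_copy_vec_cg writes the cg_cm entry and skips 1 + nrcg*nvec
 * positions, while compact_and_copy_vec_at writes vector number vec at
 * offset 1 + vec*nrcg within the same block.
 */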
3966 static int compact_ind(int ncg, int *move,
3967 int *index_gl, int *cgindex,
3968 int *gatindex,
3969 gmx_ga2la_t ga2la, char *bLocalCG,
3970 int *cginfo)
3972 int cg, nat, a0, a1, a, a_gl;
3973 int home_pos;
3975 home_pos = 0;
3976 nat = 0;
3977 for (cg = 0; cg < ncg; cg++)
3979 a0 = cgindex[cg];
3980 a1 = cgindex[cg+1];
3981 if (move[cg] == -1)
3983 /* Compact the home arrays in place.
3984 * Anything that can be done here avoids access to global arrays.
3986 cgindex[home_pos] = nat;
3987 for (a = a0; a < a1; a++)
3989 a_gl = gatindex[a];
3990 gatindex[nat] = a_gl;
3991 /* The cell number stays 0, so we don't need to set it */
3992 ga2la_change_la(ga2la, a_gl, nat);
3993 nat++;
3995 index_gl[home_pos] = index_gl[cg];
3996 cginfo[home_pos] = cginfo[cg];
3997 /* The charge group remains local, so bLocalCG does not change */
3998 home_pos++;
4000 else
4002 /* Clear the global indices */
4003 for (a = a0; a < a1; a++)
4005 ga2la_del(ga2la, gatindex[a]);
4007 if (bLocalCG)
4009 bLocalCG[index_gl[cg]] = FALSE;
4013 cgindex[home_pos] = nat;
4015 return home_pos;
4018 static void clear_and_mark_ind(int ncg, int *move,
4019 int *index_gl, int *cgindex, int *gatindex,
4020 gmx_ga2la_t ga2la, char *bLocalCG,
4021 int *cell_index)
4023 int cg, a0, a1, a;
4025 for (cg = 0; cg < ncg; cg++)
4027 if (move[cg] >= 0)
4029 a0 = cgindex[cg];
4030 a1 = cgindex[cg+1];
4031 /* Clear the global indices */
4032 for (a = a0; a < a1; a++)
4034 ga2la_del(ga2la, gatindex[a]);
4036 if (bLocalCG)
4038 bLocalCG[index_gl[cg]] = FALSE;
4040 /* Signal that this cg has moved using the ns cell index.
4041 * Here we set it to -1. fill_grid will change it
4042 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4044 cell_index[cg] = -1;
4049 static void print_cg_move(FILE *fplog,
4050 gmx_domdec_t *dd,
4051 gmx_int64_t step, int cg, int dim, int dir,
4052 gmx_bool bHaveCgcmOld, real limitd,
4053 rvec cm_old, rvec cm_new, real pos_d)
4055 gmx_domdec_comm_t *comm;
4056 char buf[22];
4058 comm = dd->comm;
4060 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4061 if (limitd > 0)
4063 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4064 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4065 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4067 else
4069 /* We don't have a limiting distance available: don't print it */
4070 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4071 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4072 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4074 fprintf(fplog, "distance out of cell %f\n",
4075 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4076 if (bHaveCgcmOld)
4078 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4079 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4081 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4082 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4083 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4084 dim2char(dim),
4085 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4086 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4087 dim2char(dim),
4088 comm->cell_x0[dim], comm->cell_x1[dim]);
4091 static void cg_move_error(FILE *fplog,
4092 gmx_domdec_t *dd,
4093 gmx_int64_t step, int cg, int dim, int dir,
4094 gmx_bool bHaveCgcmOld, real limitd,
4095 rvec cm_old, rvec cm_new, real pos_d)
4097 if (fplog)
4099 print_cg_move(fplog, dd, step, cg, dim, dir,
4100 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4102 print_cg_move(stderr, dd, step, cg, dim, dir,
4103 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4104 gmx_fatal(FARGS,
4105 "%s moved too far between two domain decomposition steps\n"
4106 "This usually means that your system is not well equilibrated",
4107 dd->comm->bCGs ? "A charge group" : "An atom");
4110 static void rotate_state_atom(t_state *state, int a)
4112 int est;
4114 for (est = 0; est < estNR; est++)
4116 if (EST_DISTR(est) && (state->flags & (1<<est)))
4118 switch (est)
4120 case estX:
4121 /* Rotate the complete state; for a rectangular box only */
4122 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4123 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4124 break;
4125 case estV:
4126 state->v[a][YY] = -state->v[a][YY];
4127 state->v[a][ZZ] = -state->v[a][ZZ];
4128 break;
4129 case estSDX:
4130 state->sd_X[a][YY] = -state->sd_X[a][YY];
4131 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4132 break;
4133 case estCGP:
4134 state->cg_p[a][YY] = -state->cg_p[a][YY];
4135 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4136 break;
4137 case estDISRE_INITF:
4138 case estDISRE_RM3TAV:
4139 case estORIRE_INITF:
4140 case estORIRE_DTAV:
4141 /* These are distances, so not affected by rotation */
4142 break;
4143 default:
4144 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4150 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4152 if (natoms > comm->moved_nalloc)
4154 /* Contents should be preserved here */
4155 comm->moved_nalloc = over_alloc_dd(natoms);
4156 srenew(comm->moved, comm->moved_nalloc);
4159 return comm->moved;
4162 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4163 gmx_domdec_t *dd,
4164 t_state *state,
4165 ivec tric_dir, matrix tcm,
4166 rvec cell_x0, rvec cell_x1,
4167 rvec limitd, rvec limit0, rvec limit1,
4168 const int *cgindex,
4169 int cg_start, int cg_end,
4170 rvec *cg_cm,
4171 int *move)
4173 int npbcdim;
4174 int cg, k, k0, k1, d, dim, d2;
4175 int mc, nrcg;
4176 int flag;
4177 gmx_bool bScrew;
4178 ivec dev;
4179 real inv_ncg, pos_d;
4180 rvec cm_new;
4182 npbcdim = dd->npbcdim;
4184 for (cg = cg_start; cg < cg_end; cg++)
4186 k0 = cgindex[cg];
4187 k1 = cgindex[cg+1];
4188 nrcg = k1 - k0;
4189 if (nrcg == 1)
4191 copy_rvec(state->x[k0], cm_new);
4193 else
4195 inv_ncg = 1.0/nrcg;
4197 clear_rvec(cm_new);
4198 for (k = k0; (k < k1); k++)
4200 rvec_inc(cm_new, state->x[k]);
4202 for (d = 0; (d < DIM); d++)
4204 cm_new[d] = inv_ncg*cm_new[d];
4208 clear_ivec(dev);
4209 /* Do pbc and check DD cell boundary crossings */
4210 for (d = DIM-1; d >= 0; d--)
4212 if (dd->nc[d] > 1)
4214 bScrew = (dd->bScrewPBC && d == XX);
4215 /* Determine the location of this cg in lattice coordinates */
4216 pos_d = cm_new[d];
4217 if (tric_dir[d])
4219 for (d2 = d+1; d2 < DIM; d2++)
4221 pos_d += cm_new[d2]*tcm[d2][d];
4224 /* Put the charge group in the triclinic unit-cell */
4225 if (pos_d >= cell_x1[d])
4227 if (pos_d >= limit1[d])
4229 cg_move_error(fplog, dd, step, cg, d, 1,
4230 cg_cm != state->x, limitd[d],
4231 cg_cm[cg], cm_new, pos_d);
4233 dev[d] = 1;
4234 if (dd->ci[d] == dd->nc[d] - 1)
4236 rvec_dec(cm_new, state->box[d]);
4237 if (bScrew)
4239 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4240 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4242 for (k = k0; (k < k1); k++)
4244 rvec_dec(state->x[k], state->box[d]);
4245 if (bScrew)
4247 rotate_state_atom(state, k);
4252 else if (pos_d < cell_x0[d])
4254 if (pos_d < limit0[d])
4256 cg_move_error(fplog, dd, step, cg, d, -1,
4257 cg_cm != state->x, limitd[d],
4258 cg_cm[cg], cm_new, pos_d);
4260 dev[d] = -1;
4261 if (dd->ci[d] == 0)
4263 rvec_inc(cm_new, state->box[d]);
4264 if (bScrew)
4266 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4267 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4269 for (k = k0; (k < k1); k++)
4271 rvec_inc(state->x[k], state->box[d]);
4272 if (bScrew)
4274 rotate_state_atom(state, k);
4280 else if (d < npbcdim)
4282 /* Put the charge group in the rectangular unit-cell */
4283 while (cm_new[d] >= state->box[d][d])
4285 rvec_dec(cm_new, state->box[d]);
4286 for (k = k0; (k < k1); k++)
4288 rvec_dec(state->x[k], state->box[d]);
4291 while (cm_new[d] < 0)
4293 rvec_inc(cm_new, state->box[d]);
4294 for (k = k0; (k < k1); k++)
4296 rvec_inc(state->x[k], state->box[d]);
4302 copy_rvec(cm_new, cg_cm[cg]);
4304 /* Determine where this cg should go */
4305 flag = 0;
4306 mc = -1;
4307 for (d = 0; d < dd->ndim; d++)
4309 dim = dd->dim[d];
4310 if (dev[dim] == 1)
4312 flag |= DD_FLAG_FW(d);
4313 if (mc == -1)
4315 mc = d*2;
4318 else if (dev[dim] == -1)
4320 flag |= DD_FLAG_BW(d);
4321 if (mc == -1)
4323 if (dd->nc[dim] > 2)
4325 mc = d*2 + 1;
4327 else
4329 mc = d*2;
4334 /* Temporarily store the flag in move */
4335 move[cg] = mc + flag;
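/* Illustrative example for the target buffer index mc above: with two
 * decomposition dimensions, a charge group that crossed the upper boundary
 * of the first dimension gets mc = 0 (forward buffer of dimension 0),
 * while one that crossed only the lower boundary of the second dimension
 * gets mc = 3 when that dimension has more than 2 cells, or mc = 2 when it
 * has exactly 2 cells (forward and backward then reach the same neighbor).
 * The direction flags are stored on top of mc in move[cg] and separated
 * out again in dd_redistribute_cg.
 */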
4339 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4340 gmx_domdec_t *dd, ivec tric_dir,
4341 t_state *state, rvec **f,
4342 t_forcerec *fr,
4343 gmx_bool bCompact,
4344 t_nrnb *nrnb,
4345 int *ncg_stay_home,
4346 int *ncg_moved)
4348 int *move;
4349 int npbcdim;
4350 int ncg[DIM*2], nat[DIM*2];
4351 int c, i, cg, k, d, dim, dim2, dir, d2, d3;
4352 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4353 int sbuf[2], rbuf[2];
4354 int home_pos_cg, home_pos_at, buf_pos;
4355 int flag;
4356 gmx_bool bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4357 real pos_d;
4358 matrix tcm;
4359 rvec *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4360 int *cgindex;
4361 cginfo_mb_t *cginfo_mb;
4362 gmx_domdec_comm_t *comm;
4363 int *moved;
4364 int nthread, thread;
4366 if (dd->bScrewPBC)
4368 check_screw_box(state->box);
4371 comm = dd->comm;
4372 if (fr->cutoff_scheme == ecutsGROUP)
4374 cg_cm = fr->cg_cm;
4377 for (i = 0; i < estNR; i++)
4379 if (EST_DISTR(i))
4381 switch (i)
4383 case estX: /* Always present */ break;
4384 case estV: bV = (state->flags & (1<<i)); break;
4385 case estSDX: bSDX = (state->flags & (1<<i)); break;
4386 case estCGP: bCGP = (state->flags & (1<<i)); break;
4387 case estLD_RNG:
4388 case estLD_RNGI:
4389 case estDISRE_INITF:
4390 case estDISRE_RM3TAV:
4391 case estORIRE_INITF:
4392 case estORIRE_DTAV:
4393 /* No processing required */
4394 break;
4395 default:
4396 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4401 if (dd->ncg_tot > comm->nalloc_int)
4403 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4404 srenew(comm->buf_int, comm->nalloc_int);
4406 move = comm->buf_int;
4408 /* Clear the count */
4409 for (c = 0; c < dd->ndim*2; c++)
4411 ncg[c] = 0;
4412 nat[c] = 0;
4415 npbcdim = dd->npbcdim;
4417 for (d = 0; (d < DIM); d++)
4419 limitd[d] = dd->comm->cellsize_min[d];
4420 if (d >= npbcdim && dd->ci[d] == 0)
4422 cell_x0[d] = -GMX_FLOAT_MAX;
4424 else
4426 cell_x0[d] = comm->cell_x0[d];
4428 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4430 cell_x1[d] = GMX_FLOAT_MAX;
4432 else
4434 cell_x1[d] = comm->cell_x1[d];
4436 if (d < npbcdim)
4438 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4439 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4441 else
4443 /* We check after communication if a charge group moved
4444 * more than one cell. Set the pre-comm check limit to float_max.
4446 limit0[d] = -GMX_FLOAT_MAX;
4447 limit1[d] = GMX_FLOAT_MAX;
4451 make_tric_corr_matrix(npbcdim, state->box, tcm);
4453 cgindex = dd->cgindex;
4455 nthread = gmx_omp_nthreads_get(emntDomdec);
4457 /* Compute the center of geometry for all home charge groups,
4458 * put them in the box and determine where they should go.
4460 #pragma omp parallel for num_threads(nthread) schedule(static)
4461 for (thread = 0; thread < nthread; thread++)
4465 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4466 cell_x0, cell_x1, limitd, limit0, limit1,
4467 cgindex,
4468 ( thread *dd->ncg_home)/nthread,
4469 ((thread+1)*dd->ncg_home)/nthread,
4470 fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4471 move);
4473 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4476 for (cg = 0; cg < dd->ncg_home; cg++)
4478 if (move[cg] >= 0)
4480 mc = move[cg];
4481 flag = mc & ~DD_FLAG_NRCG;
4482 mc = mc & DD_FLAG_NRCG;
4483 move[cg] = mc;
4485 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4487 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4488 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4490 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4491 /* We store the cg size in the lower 16 bits
4492 * and the place where the charge group should go
4493 * in the next 6 bits. This saves some communication volume.
4495 nrcg = cgindex[cg+1] - cgindex[cg];
4496 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4497 ncg[mc] += 1;
4498 nat[mc] += nrcg;
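/* Worked example of the packing above: a charge group of nrcg = 3 atoms
 * that has to move forward in decomposition dimension d sends the value
 * 3 | DD_FLAG_FW(d). Since the size only occupies the lower 16 bits, the
 * receiving rank recovers it as flag & DD_FLAG_NRCG and reads the
 * direction from the flag bits stored above it.
 */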
4502 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4503 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4505 *ncg_moved = 0;
4506 for (i = 0; i < dd->ndim*2; i++)
4508 *ncg_moved += ncg[i];
4511 nvec = 1;
4512 if (bV)
4514 nvec++;
4516 if (bSDX)
4518 nvec++;
4520 if (bCGP)
4522 nvec++;
4525 /* Make sure the communication buffers are large enough */
4526 for (mc = 0; mc < dd->ndim*2; mc++)
4528 nvr = ncg[mc] + nat[mc]*nvec;
4529 if (nvr > comm->cgcm_state_nalloc[mc])
4531 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4532 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4536 switch (fr->cutoff_scheme)
4538 case ecutsGROUP:
4539 /* Recalculating cg_cm might be cheaper than communicating,
4540 * but that could give rise to rounding issues.
4542 home_pos_cg =
4543 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4544 nvec, cg_cm, comm, bCompact);
4545 break;
4546 case ecutsVERLET:
4547 /* Without charge groups we send the moved atom coordinates
4548 * over twice. This is so the code below can be used without
4549 * many conditionals both with and without charge groups.
4551 home_pos_cg =
4552 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4553 nvec, state->x, comm, FALSE);
4554 if (bCompact)
4556 home_pos_cg -= *ncg_moved;
4558 break;
4559 default:
4560 gmx_incons("unimplemented");
4561 home_pos_cg = 0;
4564 vec = 0;
4565 home_pos_at =
4566 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4567 nvec, vec++, state->x, comm, bCompact);
4568 if (bV)
4570 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4571 nvec, vec++, state->v, comm, bCompact);
4573 if (bSDX)
4575 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4576 nvec, vec++, state->sd_X, comm, bCompact);
4578 if (bCGP)
4580 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4581 nvec, vec++, state->cg_p, comm, bCompact);
4584 if (bCompact)
4586 compact_ind(dd->ncg_home, move,
4587 dd->index_gl, dd->cgindex, dd->gatindex,
4588 dd->ga2la, comm->bLocalCG,
4589 fr->cginfo);
4591 else
4593 if (fr->cutoff_scheme == ecutsVERLET)
4595 moved = get_moved(comm, dd->ncg_home);
4597 for (k = 0; k < dd->ncg_home; k++)
4599 moved[k] = 0;
4602 else
4604 moved = fr->ns.grid->cell_index;
4607 clear_and_mark_ind(dd->ncg_home, move,
4608 dd->index_gl, dd->cgindex, dd->gatindex,
4609 dd->ga2la, comm->bLocalCG,
4610 moved);
4613 cginfo_mb = fr->cginfo_mb;
4615 *ncg_stay_home = home_pos_cg;
4616 for (d = 0; d < dd->ndim; d++)
4618 dim = dd->dim[d];
4619 ncg_recv = 0;
4620 nvr = 0;
4621 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4623 cdd = d*2 + dir;
4624 /* Communicate the cg and atom counts */
4625 sbuf[0] = ncg[cdd];
4626 sbuf[1] = nat[cdd];
4627 if (debug)
4629 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4630 d, dir, sbuf[0], sbuf[1]);
4632 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4634 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4636 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4637 srenew(comm->buf_int, comm->nalloc_int);
4640 /* Communicate the charge group indices, sizes and flags */
4641 dd_sendrecv_int(dd, d, dir,
4642 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4643 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4645 nvs = ncg[cdd] + nat[cdd]*nvec;
4646 i = rbuf[0] + rbuf[1] *nvec;
4647 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4649 /* Communicate cgcm and state */
4650 dd_sendrecv_rvec(dd, d, dir,
4651 comm->cgcm_state[cdd], nvs,
4652 comm->vbuf.v+nvr, i);
4653 ncg_recv += rbuf[0];
4654 nvr += i;
4657 /* Process the received charge groups */
4658 buf_pos = 0;
4659 for (cg = 0; cg < ncg_recv; cg++)
4661 flag = comm->buf_int[cg*DD_CGIBS+1];
4663 if (dim >= npbcdim && dd->nc[dim] > 2)
4665 /* No pbc in this dim and more than one domain boundary.
4666 * We do a separate check here whether a charge group moved too far.
4668 if (((flag & DD_FLAG_FW(d)) &&
4669 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4670 ((flag & DD_FLAG_BW(d)) &&
4671 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4673 cg_move_error(fplog, dd, step, cg, dim,
4674 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4675 fr->cutoff_scheme == ecutsGROUP, 0,
4676 comm->vbuf.v[buf_pos],
4677 comm->vbuf.v[buf_pos],
4678 comm->vbuf.v[buf_pos][dim]);
4682 mc = -1;
4683 if (d < dd->ndim-1)
4685 /* Check which direction this cg should go */
4686 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4688 if (dlbIsOn(dd->comm))
4690 /* The cell boundaries for dimension d2 are not equal
4691 * for each cell row of the lower dimension(s),
4692 * therefore we might need to redetermine where
4693 * this cg should go.
4695 dim2 = dd->dim[d2];
4696 /* If this cg crosses the box boundary in dimension d2
4697 * we can use the communicated flag, so we do not
4698 * have to worry about pbc.
4700 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4701 (flag & DD_FLAG_FW(d2))) ||
4702 (dd->ci[dim2] == 0 &&
4703 (flag & DD_FLAG_BW(d2)))))
4705 /* Clear the two flags for this dimension */
4706 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4707 /* Determine the location of this cg
4708 * in lattice coordinates
4710 pos_d = comm->vbuf.v[buf_pos][dim2];
4711 if (tric_dir[dim2])
4713 for (d3 = dim2+1; d3 < DIM; d3++)
4715 pos_d +=
4716 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4719 /* Check that we are not at the box edge.
4720 * pbc is only handled in the first step above,
4721 * but this check could move over pbc while
4722 * the first step did not due to different rounding.
4724 if (pos_d >= cell_x1[dim2] &&
4725 dd->ci[dim2] != dd->nc[dim2]-1)
4727 flag |= DD_FLAG_FW(d2);
4729 else if (pos_d < cell_x0[dim2] &&
4730 dd->ci[dim2] != 0)
4732 flag |= DD_FLAG_BW(d2);
4734 comm->buf_int[cg*DD_CGIBS+1] = flag;
4737 /* Set to which neighboring cell this cg should go */
4738 if (flag & DD_FLAG_FW(d2))
4740 mc = d2*2;
4742 else if (flag & DD_FLAG_BW(d2))
4744 if (dd->nc[dd->dim[d2]] > 2)
4746 mc = d2*2+1;
4748 else
4750 mc = d2*2;
4756 nrcg = flag & DD_FLAG_NRCG;
4757 if (mc == -1)
4759 if (home_pos_cg+1 > dd->cg_nalloc)
4761 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4762 srenew(dd->index_gl, dd->cg_nalloc);
4763 srenew(dd->cgindex, dd->cg_nalloc+1);
4765 /* Set the global charge group index and size */
4766 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4767 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4768 /* Copy the state from the buffer */
4769 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4770 if (fr->cutoff_scheme == ecutsGROUP)
4772 cg_cm = fr->cg_cm;
4773 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4775 buf_pos++;
4777 /* Set the cginfo */
4778 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4779 dd->index_gl[home_pos_cg]);
4780 if (comm->bLocalCG)
4782 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4785 if (home_pos_at+nrcg > state->nalloc)
4787 dd_realloc_state(state, f, home_pos_at+nrcg);
4789 for (i = 0; i < nrcg; i++)
4791 copy_rvec(comm->vbuf.v[buf_pos++],
4792 state->x[home_pos_at+i]);
4794 if (bV)
4796 for (i = 0; i < nrcg; i++)
4798 copy_rvec(comm->vbuf.v[buf_pos++],
4799 state->v[home_pos_at+i]);
4802 if (bSDX)
4804 for (i = 0; i < nrcg; i++)
4806 copy_rvec(comm->vbuf.v[buf_pos++],
4807 state->sd_X[home_pos_at+i]);
4810 if (bCGP)
4812 for (i = 0; i < nrcg; i++)
4814 copy_rvec(comm->vbuf.v[buf_pos++],
4815 state->cg_p[home_pos_at+i]);
4818 home_pos_cg += 1;
4819 home_pos_at += nrcg;
4821 else
4823 /* Reallocate the buffers if necessary */
4824 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4826 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4827 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4829 nvr = ncg[mc] + nat[mc]*nvec;
4830 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4832 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4833 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4835 /* Copy from the receive to the send buffers */
4836 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4837 comm->buf_int + cg*DD_CGIBS,
4838 DD_CGIBS*sizeof(int));
4839 memcpy(comm->cgcm_state[mc][nvr],
4840 comm->vbuf.v[buf_pos],
4841 (1+nrcg*nvec)*sizeof(rvec));
4842 buf_pos += 1 + nrcg*nvec;
4843 ncg[mc] += 1;
4844 nat[mc] += nrcg;
4849 /* With sorting (!bCompact) the indices are now only partially up to date
4850 * and ncg_home and nat_home are not the real count, since there are
4851 * "holes" in the arrays for the charge groups that moved to neighbors.
4853 if (fr->cutoff_scheme == ecutsVERLET)
4855 moved = get_moved(comm, home_pos_cg);
4857 for (i = dd->ncg_home; i < home_pos_cg; i++)
4859 moved[i] = 0;
4862 dd->ncg_home = home_pos_cg;
4863 dd->nat_home = home_pos_at;
4865 if (debug)
4867 fprintf(debug,
4868 "Finished repartitioning: cgs moved out %d, new home %d\n",
4869 *ncg_moved, dd->ncg_home-*ncg_moved);
4874 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4876 /* Note that the cycles value can be incorrect, either 0 or some
4877 * extremely large value, when our thread migrated to another core
4878 * with an unsynchronized cycle counter. If this happens less often
4879 * than once per nstlist steps, this will not cause issues, since
4880 * we later subtract the maximum value from the sum over nstlist steps.
4881 * A zero count will slightly lower the total, but that's a small effect.
4882 * Note that the main purpose of the subtraction of the maximum value
4883 * is to avoid throwing off the load balancing when stalls occur due
4884 * e.g. system activity or network congestion.
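/* For illustration (hypothetical numbers): with nstlist = 10 and force cycle
 * counts of 100 for nine steps plus a single stalled step of 10000, the sum
 * is 10900; subtracting the maximum leaves 900, close to the undisturbed
 * 9 x 100, so one stall or counter glitch barely perturbs the balancing.
 */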
4886 dd->comm->cycl[ddCycl] += cycles;
4887 dd->comm->cycl_n[ddCycl]++;
4888 if (cycles > dd->comm->cycl_max[ddCycl])
4890 dd->comm->cycl_max[ddCycl] = cycles;
4894 static double force_flop_count(t_nrnb *nrnb)
4896 int i;
4897 double sum;
4898 const char *name;
4900 sum = 0;
4901 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4903 /* To get closer to the real timings, we halve the count
4904 * for the normal loops and halve it again for water loops.
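/* For illustration (hypothetical numbers): a kernel counter with n = 1000
 * and a nominal cost of 30 flops contributes 1000*0.5*30 = 15000 here for a
 * normal loop, but only 1000*0.25*30 = 7500 for a W3/W4 water loop.
 */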
4906 name = nrnb_str(i);
4907 if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4909 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4911 else
4913 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4916 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4918 name = nrnb_str(i);
4919 if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4921 sum += nrnb->n[i]*cost_nrnb(i);
4924 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4926 sum += nrnb->n[i]*cost_nrnb(i);
4929 return sum;
4932 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4934 if (dd->comm->eFlop)
4936 dd->comm->flop -= force_flop_count(nrnb);
4939 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4941 if (dd->comm->eFlop)
4943 dd->comm->flop += force_flop_count(nrnb);
4944 dd->comm->flop_n++;
4948 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4950 int i;
4952 for (i = 0; i < ddCyclNr; i++)
4954 dd->comm->cycl[i] = 0;
4955 dd->comm->cycl_n[i] = 0;
4956 dd->comm->cycl_max[i] = 0;
4958 dd->comm->flop = 0;
4959 dd->comm->flop_n = 0;
4962 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4964 gmx_domdec_comm_t *comm;
4965 domdec_load_t *load;
4966 domdec_root_t *root = NULL;
4967 int d, dim, i, pos;
4968 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4969 gmx_bool bSepPME;
4971 if (debug)
4973 fprintf(debug, "get_load_distribution start\n");
4976 wallcycle_start(wcycle, ewcDDCOMMLOAD);
4978 comm = dd->comm;
4980 bSepPME = (dd->pme_nodeid >= 0);
4982 if (dd->ndim == 0 && bSepPME)
4984 /* Without decomposition, but with PME nodes, we need the load */
4985 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
4986 comm->load[0].pme = comm->cycl[ddCyclPME];
4989 for (d = dd->ndim-1; d >= 0; d--)
4991 dim = dd->dim[d];
4992 /* Check if we participate in the communication in this dimension */
4993 if (d == dd->ndim-1 ||
4994 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
4996 load = &comm->load[d];
4997 if (dlbIsOn(dd->comm))
4999 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5001 pos = 0;
5002 if (d == dd->ndim-1)
5004 sbuf[pos++] = dd_force_load(comm);
5005 sbuf[pos++] = sbuf[0];
5006 if (dlbIsOn(dd->comm))
5008 sbuf[pos++] = sbuf[0];
5009 sbuf[pos++] = cell_frac;
5010 if (d > 0)
5012 sbuf[pos++] = comm->cell_f_max0[d];
5013 sbuf[pos++] = comm->cell_f_min1[d];
5016 if (bSepPME)
5018 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5019 sbuf[pos++] = comm->cycl[ddCyclPME];
5022 else
5024 sbuf[pos++] = comm->load[d+1].sum;
5025 sbuf[pos++] = comm->load[d+1].max;
5026 if (dlbIsOn(dd->comm))
5028 sbuf[pos++] = comm->load[d+1].sum_m;
5029 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5030 sbuf[pos++] = comm->load[d+1].flags;
5031 if (d > 0)
5033 sbuf[pos++] = comm->cell_f_max0[d];
5034 sbuf[pos++] = comm->cell_f_min1[d];
5037 if (bSepPME)
5039 sbuf[pos++] = comm->load[d+1].mdf;
5040 sbuf[pos++] = comm->load[d+1].pme;
5043 load->nload = pos;
5044 /* Communicate a row in DD direction d.
5045 * The communicators are set up such that the root always has rank 0.
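/* For a row of n ranks each sending nload floats, the root receives a
 * contiguous, rank-ordered array of n*nload floats in load->load, which is
 * unpacked in the per-rank loop below.
 */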
5047 #ifdef GMX_MPI
5048 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5049 load->load, load->nload*sizeof(float), MPI_BYTE,
5050 0, comm->mpi_comm_load[d]);
5051 #endif
5052 if (dd->ci[dim] == dd->master_ci[dim])
5054 /* We are the root, process this row */
5055 if (dlbIsOn(comm))
5057 root = comm->root[d];
5059 load->sum = 0;
5060 load->max = 0;
5061 load->sum_m = 0;
5062 load->cvol_min = 1;
5063 load->flags = 0;
5064 load->mdf = 0;
5065 load->pme = 0;
5066 pos = 0;
5067 for (i = 0; i < dd->nc[dim]; i++)
5069 load->sum += load->load[pos++];
5070 load->max = std::max(load->max, load->load[pos]);
5071 pos++;
5072 if (dlbIsOn(dd->comm))
5074 if (root->bLimited)
5076 /* This direction could not be load balanced properly,
5077 * therefore we need to use the maximum instead of the average load.
5079 load->sum_m = std::max(load->sum_m, load->load[pos]);
5081 else
5083 load->sum_m += load->load[pos];
5085 pos++;
5086 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5087 pos++;
5088 if (d < dd->ndim-1)
5090 load->flags = (int)(load->load[pos++] + 0.5);
5092 if (d > 0)
5094 root->cell_f_max0[i] = load->load[pos++];
5095 root->cell_f_min1[i] = load->load[pos++];
5098 if (bSepPME)
5100 load->mdf = std::max(load->mdf, load->load[pos]);
5101 pos++;
5102 load->pme = std::max(load->pme, load->load[pos]);
5103 pos++;
5106 if (dlbIsOn(comm) && root->bLimited)
5108 load->sum_m *= dd->nc[dim];
5109 load->flags |= (1<<d);
5115 if (DDMASTER(dd))
5117 comm->nload += dd_load_count(comm);
5118 comm->load_step += comm->cycl[ddCyclStep];
5119 comm->load_sum += comm->load[0].sum;
5120 comm->load_max += comm->load[0].max;
5121 if (dlbIsOn(comm))
5123 for (d = 0; d < dd->ndim; d++)
5125 if (comm->load[0].flags & (1<<d))
5127 comm->load_lim[d]++;
5131 if (bSepPME)
5133 comm->load_mdf += comm->load[0].mdf;
5134 comm->load_pme += comm->load[0].pme;
5138 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5140 if (debug)
5142 fprintf(debug, "get_load_distribution finished\n");
5146 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5148 /* Return the relative performance loss on the total run time
5149 * due to the force calculation load imbalance.
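/* Worked example (hypothetical numbers): 4 ranks with accumulated force
 * loads 80, 90, 100 and 130 (sum 400, max 130) and 200 step cycles per rank
 * give (130*4 - 400)/(200*4) = 0.15, i.e. 15% of the run time lost to
 * waiting.
 */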
5151 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5153 return
5154 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5155 (dd->comm->load_step*dd->nnodes);
5157 else
5159 return 0;
5163 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5165 char buf[STRLEN];
5166 int npp, npme, nnodes, d, limp;
5167 float imbal, pme_f_ratio, lossf = 0, lossp = 0;
5168 gmx_bool bLim;
5169 gmx_domdec_comm_t *comm;
5171 comm = dd->comm;
5172 if (DDMASTER(dd) && comm->nload > 0)
5174 npp = dd->nnodes;
5175 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5176 nnodes = npp + npme;
5177 if (dd->nnodes > 1 && comm->load_sum > 0)
5179 imbal = comm->load_max*npp/comm->load_sum - 1;
5180 lossf = dd_force_imb_perf_loss(dd);
5181 sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5182 fprintf(fplog, "%s", buf);
5183 fprintf(stderr, "\n");
5184 fprintf(stderr, "%s", buf);
5185 sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5186 fprintf(fplog, "%s", buf);
5187 fprintf(stderr, "%s", buf);
5189 bLim = FALSE;
5190 if (dlbIsOn(comm))
5192 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5193 for (d = 0; d < dd->ndim; d++)
5195 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5196 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5197 if (limp >= 50)
5199 bLim = TRUE;
5202 sprintf(buf+strlen(buf), "\n");
5203 fprintf(fplog, "%s", buf);
5204 fprintf(stderr, "%s", buf);
5206 if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5208 pme_f_ratio = comm->load_pme/comm->load_mdf;
5209 lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
5210 if (lossp <= 0)
5212 lossp *= (float)npme/(float)nnodes;
5214 else
5216 lossp *= (float)npp/(float)nnodes;
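/* For illustration (hypothetical numbers): load_pme = 120, load_mdf = 100
 * and load_step = 400 give lossp = 0.05; since the PME ranks have the
 * larger load, the loss is scaled by npp/nnodes, e.g. 6/8 with 6 PP and
 * 2 PME ranks, because it is the PP ranks that sit idle.
 */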
5218 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5219 fprintf(fplog, "%s", buf);
5220 fprintf(stderr, "%s", buf);
5221 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5222 fprintf(fplog, "%s", buf);
5223 fprintf(stderr, "%s", buf);
5225 fprintf(fplog, "\n");
5226 fprintf(stderr, "\n");
5228 if (lossf >= DD_PERF_LOSS_WARN)
5230 sprintf(buf,
5231 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5232 " in the domain decomposition.\n", lossf*100);
5233 if (!dlbIsOn(comm))
5235 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n");
5237 else if (bLim)
5239 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5241 fprintf(fplog, "%s\n", buf);
5242 fprintf(stderr, "%s\n", buf);
5244 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5246 sprintf(buf,
5247 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5248 " had %s work to do than the PP ranks.\n"
5249 " You might want to %s the number of PME ranks\n"
5250 " or %s the cut-off and the grid spacing.\n",
5251 fabs(lossp*100),
5252 (lossp < 0) ? "less" : "more",
5253 (lossp < 0) ? "decrease" : "increase",
5254 (lossp < 0) ? "decrease" : "increase");
5255 fprintf(fplog, "%s\n", buf);
5256 fprintf(stderr, "%s\n", buf);
5261 static float dd_vol_min(gmx_domdec_t *dd)
5263 return dd->comm->load[0].cvol_min*dd->nnodes;
5266 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5268 return dd->comm->load[0].flags;
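/* The force-load imbalance reported below is max/average - 1: e.g. loads of
 * 80, 90, 100 and 130 on 4 ranks (hypothetical numbers) give
 * 130/(400/4) - 1 = 0.30, i.e. 30% imbalance.
 */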
5271 static float dd_f_imbal(gmx_domdec_t *dd)
5273 if (dd->comm->load[0].sum > 0)
5275 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5277 else
5279 /* Something is wrong in the cycle counting, report no load imbalance */
5280 return 0.0f;
5284 float dd_pme_f_ratio(gmx_domdec_t *dd)
5286 /* Should only be called on the DD master rank */
5287 assert(DDMASTER(dd));
5289 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5291 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5293 else
5295 return -1.0;
5299 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5301 int flags, d;
5302 char buf[22];
5304 flags = dd_load_flags(dd);
5305 if (flags)
5307 fprintf(fplog,
5308 "DD load balancing is limited by minimum cell size in dimension");
5309 for (d = 0; d < dd->ndim; d++)
5311 if (flags & (1<<d))
5313 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5316 fprintf(fplog, "\n");
5318 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5319 if (dlbIsOn(dd->comm))
5321 fprintf(fplog, " vol min/aver %5.3f%c",
5322 dd_vol_min(dd), flags ? '!' : ' ');
5324 if (dd->nnodes > 1)
5326 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5328 if (dd->comm->cycl_n[ddCyclPME])
5330 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5332 fprintf(fplog, "\n\n");
5335 static void dd_print_load_verbose(gmx_domdec_t *dd)
5337 if (dlbIsOn(dd->comm))
5339 fprintf(stderr, "vol %4.2f%c ",
5340 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5342 if (dd->nnodes > 1)
5344 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5346 if (dd->comm->cycl_n[ddCyclPME])
5348 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5352 #ifdef GMX_MPI
5353 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5355 MPI_Comm c_row;
5356 int dim, i, rank;
5357 ivec loc_c;
5358 domdec_root_t *root;
5359 gmx_bool bPartOfGroup = FALSE;
5361 dim = dd->dim[dim_ind];
5362 copy_ivec(loc, loc_c);
5363 for (i = 0; i < dd->nc[dim]; i++)
5365 loc_c[dim] = i;
5366 rank = dd_index(dd->nc, loc_c);
5367 if (rank == dd->rank)
5369 /* This process is part of the group */
5370 bPartOfGroup = TRUE;
5373 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5374 &c_row);
5375 if (bPartOfGroup)
5377 dd->comm->mpi_comm_load[dim_ind] = c_row;
5378 if (dd->comm->dlbState != edlbsOffForever)
5380 if (dd->ci[dim] == dd->master_ci[dim])
5382 /* This is the root process of this row */
5383 snew(dd->comm->root[dim_ind], 1);
5384 root = dd->comm->root[dim_ind];
5385 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5386 snew(root->old_cell_f, dd->nc[dim]+1);
5387 snew(root->bCellMin, dd->nc[dim]);
5388 if (dim_ind > 0)
5390 snew(root->cell_f_max0, dd->nc[dim]);
5391 snew(root->cell_f_min1, dd->nc[dim]);
5392 snew(root->bound_min, dd->nc[dim]);
5393 snew(root->bound_max, dd->nc[dim]);
5395 snew(root->buf_ncd, dd->nc[dim]);
5397 else
5399 /* This is not a root process, we only need to receive cell_f */
5400 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5403 if (dd->ci[dim] == dd->master_ci[dim])
5405 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5409 #endif
5411 void dd_setup_dlb_resource_sharing(t_commrec gmx_unused *cr,
5412 const gmx_hw_info_t gmx_unused *hwinfo,
5413 const gmx_hw_opt_t gmx_unused *hw_opt)
5415 #ifdef GMX_MPI
5416 int physicalnode_id_hash;
5417 int gpu_id;
5418 gmx_domdec_t *dd;
5419 MPI_Comm mpi_comm_pp_physicalnode;
5421 if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5423 /* Only PP nodes (currently) use GPUs.
5424 * If we don't have GPUs, there are no resources to share.
5426 return;
5429 physicalnode_id_hash = gmx_physicalnode_id_hash();
5431 gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5433 dd = cr->dd;
5435 if (debug)
5437 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5438 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5439 dd->rank, physicalnode_id_hash, gpu_id);
5441 /* Split the PP communicator over the physical nodes */
5442 /* TODO: See if we should store this (before), as it's also used
5443 * for the nodecomm summation.
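/* For illustration (hypothetical mapping): on a node running 4 PP ranks
 * with 2 GPUs, the second split below groups the two ranks sharing gpu_id 0
 * and the two sharing gpu_id 1, so each rank ends up with
 * nrank_gpu_shared = 2.
 */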
5445 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5446 &mpi_comm_pp_physicalnode);
5447 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5448 &dd->comm->mpi_comm_gpu_shared);
5449 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5450 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5452 if (debug)
5454 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5457 /* Note that some ranks could share a GPU, while others don't */
5459 if (dd->comm->nrank_gpu_shared == 1)
5461 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5463 #endif
5466 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5468 #ifdef GMX_MPI
5469 int dim0, dim1, i, j;
5470 ivec loc;
5472 if (debug)
5474 fprintf(debug, "Making load communicators\n");
5477 snew(dd->comm->load, std::max(dd->ndim, 1));
5478 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5480 if (dd->ndim == 0)
5482 return;
5485 clear_ivec(loc);
5486 make_load_communicator(dd, 0, loc);
5487 if (dd->ndim > 1)
5489 dim0 = dd->dim[0];
5490 for (i = 0; i < dd->nc[dim0]; i++)
5492 loc[dim0] = i;
5493 make_load_communicator(dd, 1, loc);
5496 if (dd->ndim > 2)
5498 dim0 = dd->dim[0];
5499 for (i = 0; i < dd->nc[dim0]; i++)
5501 loc[dim0] = i;
5502 dim1 = dd->dim[1];
5503 for (j = 0; j < dd->nc[dim1]; j++)
5505 loc[dim1] = j;
5506 make_load_communicator(dd, 2, loc);
5511 if (debug)
5513 fprintf(debug, "Finished making load communicators\n");
5515 #endif
5518 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5520 int d, dim, i, j, m;
5521 ivec tmp, s;
5522 int nzone, nzonep;
5523 ivec dd_zp[DD_MAXIZONE];
5524 gmx_domdec_zones_t *zones;
5525 gmx_domdec_ns_ranges_t *izone;
5527 for (d = 0; d < dd->ndim; d++)
5529 dim = dd->dim[d];
5530 copy_ivec(dd->ci, tmp);
5531 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5532 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5533 copy_ivec(dd->ci, tmp);
5534 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5535 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5536 if (debug)
5538 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5539 dd->rank, dim,
5540 dd->neighbor[d][0],
5541 dd->neighbor[d][1]);
5545 if (fplog)
5547 fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5548 dd->ndim,
5549 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5550 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5552 switch (dd->ndim)
5554 case 3:
5555 nzone = dd_z3n;
5556 nzonep = dd_zp3n;
5557 for (i = 0; i < nzonep; i++)
5559 copy_ivec(dd_zp3[i], dd_zp[i]);
5561 break;
5562 case 2:
5563 nzone = dd_z2n;
5564 nzonep = dd_zp2n;
5565 for (i = 0; i < nzonep; i++)
5567 copy_ivec(dd_zp2[i], dd_zp[i]);
5569 break;
5570 case 1:
5571 nzone = dd_z1n;
5572 nzonep = dd_zp1n;
5573 for (i = 0; i < nzonep; i++)
5575 copy_ivec(dd_zp1[i], dd_zp[i]);
5577 break;
5578 case 0:
5579 nzone = dd_z0n;
5580 nzonep = dd_zp0n;
5581 for (i = 0; i < nzonep; i++)
5583 copy_ivec(dd_zp0[i], dd_zp[i]);
5585 break;
5586 default:
5587 gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5588 nzone = 0;
5589 nzonep = 0;
5592 zones = &dd->comm->zones;
5594 for (i = 0; i < nzone; i++)
5596 m = 0;
5597 clear_ivec(zones->shift[i]);
5598 for (d = 0; d < dd->ndim; d++)
5600 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5604 zones->n = nzone;
5605 for (i = 0; i < nzone; i++)
5607 for (d = 0; d < DIM; d++)
5609 s[d] = dd->ci[d] - zones->shift[i][d];
5610 if (s[d] < 0)
5612 s[d] += dd->nc[d];
5614 else if (s[d] >= dd->nc[d])
5616 s[d] -= dd->nc[d];
5620 zones->nizone = nzonep;
5621 for (i = 0; i < zones->nizone; i++)
5623 if (dd_zp[i][0] != i)
5625 gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5627 izone = &zones->izone[i];
5628 izone->j0 = dd_zp[i][1];
5629 izone->j1 = dd_zp[i][2];
5630 for (dim = 0; dim < DIM; dim++)
5632 if (dd->nc[dim] == 1)
5634 /* All shifts should be allowed */
5635 izone->shift0[dim] = -1;
5636 izone->shift1[dim] = 1;
5638 else
5640 /*
5641 izone->shift0[d] = 0;
5642 izone->shift1[d] = 0;
5643 for(j=izone->j0; j<izone->j1; j++) {
5644 if (dd->shift[j][d] > dd->shift[i][d])
5645 izone->shift0[d] = -1;
5646 if (dd->shift[j][d] < dd->shift[i][d])
5647 izone->shift1[d] = 1;
5648 }
5649 */
5651 int shift_diff;
5653 /* Assume the shifts are not more than 1 cell */
5654 izone->shift0[dim] = 1;
5655 izone->shift1[dim] = -1;
5656 for (j = izone->j0; j < izone->j1; j++)
5658 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5659 if (shift_diff < izone->shift0[dim])
5661 izone->shift0[dim] = shift_diff;
5663 if (shift_diff > izone->shift1[dim])
5665 izone->shift1[dim] = shift_diff;
5672 if (dd->comm->dlbState != edlbsOffForever)
5674 snew(dd->comm->root, dd->ndim);
5677 if (dd->comm->bRecordLoad)
5679 make_load_communicators(dd);
5683 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5685 gmx_domdec_t *dd;
5686 dd = cr->dd;
5688 #ifdef GMX_MPI
5689 gmx_domdec_comm_t *comm;
5690 int rank, *buf;
5691 ivec periods;
5692 MPI_Comm comm_cart;
5694 comm = dd->comm;
5696 if (comm->bCartesianPP)
5698 /* Set up cartesian communication for the particle-particle part */
5699 if (fplog)
5701 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5702 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5705 for (int i = 0; i < DIM; i++)
5707 periods[i] = TRUE;
5709 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5710 &comm_cart);
5711 /* We overwrite the old communicator with the new cartesian one */
5712 cr->mpi_comm_mygroup = comm_cart;
5715 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5716 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5718 if (comm->bCartesianPP_PME)
5720 /* Since we want to use the original Cartesian setup for the simulation,
5721 * and not the one after the split, we need to make an index.
5723 snew(comm->ddindex2ddnodeid, dd->nnodes);
5724 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5725 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5726 /* Get the rank of the DD master,
5727 * above we made sure that the master node is a PP node.
5729 if (MASTER(cr))
5731 rank = dd->rank;
5733 else
5735 rank = 0;
5737 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5739 else if (comm->bCartesianPP)
5741 if (cr->npmenodes == 0)
5743 /* The PP communicator is also
5744 * the communicator for this simulation
5746 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5748 cr->nodeid = dd->rank;
5750 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5752 /* We need to make an index to go from the coordinates
5753 * to the nodeid of this simulation.
5755 snew(comm->ddindex2simnodeid, dd->nnodes);
5756 snew(buf, dd->nnodes);
5757 if (cr->duty & DUTY_PP)
5759 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5761 /* Communicate the ddindex to simulation nodeid index */
5762 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5763 cr->mpi_comm_mysim);
5764 sfree(buf);
5766 /* Determine the master coordinates and rank.
5767 * The DD master should be the same node as the master of this sim.
5769 for (int i = 0; i < dd->nnodes; i++)
5771 if (comm->ddindex2simnodeid[i] == 0)
5773 ddindex2xyz(dd->nc, i, dd->master_ci);
5774 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5777 if (debug)
5779 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5782 else
5784 /* No Cartesian communicators */
5785 /* We use the rank in dd->comm->all as DD index */
5786 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5787 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5788 dd->masterrank = 0;
5789 clear_ivec(dd->master_ci);
5791 #endif
5793 if (fplog)
5795 fprintf(fplog,
5796 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5797 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5799 if (debug)
5801 fprintf(debug,
5802 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5803 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5807 static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
5809 #ifdef GMX_MPI
5810 gmx_domdec_t *dd;
5811 gmx_domdec_comm_t *comm;
5813 dd = cr->dd;
5814 comm = dd->comm;
5816 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5818 int *buf;
5819 snew(comm->ddindex2simnodeid, dd->nnodes);
5820 snew(buf, dd->nnodes);
5821 if (cr->duty & DUTY_PP)
5823 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5825 /* Communicate the ddindex to simulation nodeid index */
5826 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5827 cr->mpi_comm_mysim);
5828 sfree(buf);
5830 #endif
5833 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5834 int ncg, int natoms)
5836 gmx_domdec_master_t *ma;
5837 int i;
5839 snew(ma, 1);
5841 snew(ma->ncg, dd->nnodes);
5842 snew(ma->index, dd->nnodes+1);
5843 snew(ma->cg, ncg);
5844 snew(ma->nat, dd->nnodes);
5845 snew(ma->ibuf, dd->nnodes*2);
5846 snew(ma->cell_x, DIM);
5847 for (i = 0; i < DIM; i++)
5849 snew(ma->cell_x[i], dd->nc[i]+1);
5852 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5854 ma->vbuf = NULL;
5856 else
5858 snew(ma->vbuf, natoms);
5861 return ma;
5864 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
5865 int gmx_unused reorder)
5867 gmx_domdec_t *dd;
5868 gmx_domdec_comm_t *comm;
5869 int i;
5870 gmx_bool bDiv[DIM];
5871 #ifdef GMX_MPI
5872 MPI_Comm comm_cart;
5873 #endif
5875 dd = cr->dd;
5876 comm = dd->comm;
5878 if (comm->bCartesianPP)
5880 for (i = 1; i < DIM; i++)
5882 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5884 if (bDiv[YY] || bDiv[ZZ])
5886 comm->bCartesianPP_PME = TRUE;
5887 /* If we have 2D PME decomposition, which is always in x+y,
5888 * we stack the PME only nodes in z.
5889 * Otherwise we choose the direction that provides the thinnest slab
5890 * of PME only nodes as this will have the least effect
5891 * on the PP communication.
5892 * But for the PME communication the opposite might be better.
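/* For illustration (hypothetical counts): dd->nc = 4x4x2 (32 PP ranks) with
 * 8 PME-only ranks makes bDiv[YY] true (8*4 % 32 == 0) but not bDiv[ZZ],
 * so cartpmedim = YY and ntot becomes 4x5x2 = 40, i.e. the PME ranks form
 * one extra slab stacked in y.
 */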
5894 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5895 !bDiv[YY] ||
5896 dd->nc[YY] > dd->nc[ZZ]))
5898 comm->cartpmedim = ZZ;
5900 else
5902 comm->cartpmedim = YY;
5904 comm->ntot[comm->cartpmedim]
5905 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5907 else if (fplog)
5909 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5910 fprintf(fplog,
5911 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5915 #ifdef GMX_MPI
5916 if (comm->bCartesianPP_PME)
5918 int rank;
5919 ivec periods;
5921 if (fplog)
5923 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5926 for (i = 0; i < DIM; i++)
5928 periods[i] = TRUE;
5930 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5931 &comm_cart);
5932 MPI_Comm_rank(comm_cart, &rank);
5933 if (MASTERNODE(cr) && rank != 0)
5935 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5938 /* With this assignment we lose the link to the original communicator,
5939 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5941 cr->mpi_comm_mysim = comm_cart;
5942 cr->sim_nodeid = rank;
5944 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5946 if (fplog)
5948 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5949 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5952 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5954 cr->duty = DUTY_PP;
5956 if (cr->npmenodes == 0 ||
5957 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5959 cr->duty = DUTY_PME;
5962 /* Split the sim communicator into PP and PME only nodes */
5963 MPI_Comm_split(cr->mpi_comm_mysim,
5964 cr->duty,
5965 dd_index(comm->ntot, dd->ci),
5966 &cr->mpi_comm_mygroup);
5968 else
5970 switch (dd_node_order)
5972 case ddnoPP_PME:
5973 if (fplog)
5975 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5977 break;
5978 case ddnoINTERLEAVE:
5979 /* Interleave the PP-only and PME-only nodes,
5980 * as on clusters with dual-core machines this will double
5981 * the communication bandwidth of the PME processes
5982 * and thus speed up the PP <-> PME and inter PME communication.
5984 if (fplog)
5986 fprintf(fplog, "Interleaving PP and PME ranks\n");
5988 comm->pmenodes = dd_pmenodes(cr);
5989 break;
5990 case ddnoCARTESIAN:
5991 break;
5992 default:
5993 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
5996 if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
5998 cr->duty = DUTY_PME;
6000 else
6002 cr->duty = DUTY_PP;
6005 /* Split the sim communicator into PP and PME only nodes */
6006 MPI_Comm_split(cr->mpi_comm_mysim,
6007 cr->duty,
6008 cr->nodeid,
6009 &cr->mpi_comm_mygroup);
6010 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6012 #endif
6014 if (fplog)
6016 fprintf(fplog, "This rank does only %s work.\n\n",
6017 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6021 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6023 gmx_domdec_t *dd;
6024 gmx_domdec_comm_t *comm;
6025 int CartReorder;
6027 dd = cr->dd;
6028 comm = dd->comm;
6030 copy_ivec(dd->nc, comm->ntot);
6032 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6033 comm->bCartesianPP_PME = FALSE;
6035 /* Reorder the nodes by default. This might change the MPI ranks.
6036 * Real reordering is only supported on very few architectures,
6037 * Blue Gene is one of them.
6039 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6041 if (cr->npmenodes > 0)
6043 /* Split the communicator into a PP and PME part */
6044 split_communicator(fplog, cr, dd_node_order, CartReorder);
6045 if (comm->bCartesianPP_PME)
6047 /* We (possibly) reordered the nodes in split_communicator,
6048 * so it is no longer required in make_pp_communicator.
6050 CartReorder = FALSE;
6053 else
6055 /* All nodes do PP and PME */
6056 #ifdef GMX_MPI
6057 /* We do not require separate communicators */
6058 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6059 #endif
6062 if (cr->duty & DUTY_PP)
6064 /* Copy or make a new PP communicator */
6065 make_pp_communicator(fplog, cr, CartReorder);
6067 else
6069 receive_ddindex2simnodeid(cr);
6072 if (!(cr->duty & DUTY_PME))
6074 /* Set up the communication to our PME node */
6075 dd->pme_nodeid = dd_simnode2pmenode(cr, cr->sim_nodeid);
6076 dd->pme_receive_vir_ener = receive_vir_ener(cr);
6077 if (debug)
6079 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6080 dd->pme_nodeid, dd->pme_receive_vir_ener);
6083 else
6085 dd->pme_nodeid = -1;
6088 if (DDMASTER(dd))
6090 dd->ma = init_gmx_domdec_master_t(dd,
6091 comm->cgs_gl.nr,
6092 comm->cgs_gl.index[comm->cgs_gl.nr]);
6096 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6098 real *slb_frac, tot;
6099 int i, n;
6100 double dbl;
6102 slb_frac = NULL;
6103 if (nc > 1 && size_string != NULL)
6105 if (fplog)
6107 fprintf(fplog, "Using static load balancing for the %s direction\n",
6108 dir);
6110 snew(slb_frac, nc);
6111 tot = 0;
6112 for (i = 0; i < nc; i++)
6114 dbl = 0;
6115 sscanf(size_string, "%20lf%n", &dbl, &n);
6116 if (dbl == 0)
6118 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6120 slb_frac[i] = dbl;
6121 size_string += n;
6122 tot += slb_frac[i];
6124 /* Normalize */
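/* For illustration (hypothetical input): a size string of "1 2 1" for a
 * direction with 3 cells yields relative cell sizes 0.25, 0.50 and 0.25
 * after normalization.
 */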
6125 if (fplog)
6127 fprintf(fplog, "Relative cell sizes:");
6129 for (i = 0; i < nc; i++)
6131 slb_frac[i] /= tot;
6132 if (fplog)
6134 fprintf(fplog, " %5.3f", slb_frac[i]);
6137 if (fplog)
6139 fprintf(fplog, "\n");
6143 return slb_frac;
6146 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6148 int n, nmol, ftype;
6149 gmx_mtop_ilistloop_t iloop;
6150 t_ilist *il;
6152 n = 0;
6153 iloop = gmx_mtop_ilistloop_init(mtop);
6154 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6156 for (ftype = 0; ftype < F_NRE; ftype++)
6158 if ((interaction_function[ftype].flags & IF_BOND) &&
6159 NRAL(ftype) > 2)
6161 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
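/* il[ftype].nr counts ints: one interaction-type index plus NRAL(ftype)
 * atom indices per entry, so e.g. 100 angle entries (NRAL = 3) occupy
 * 400 ints and contribute 100*nmol multi-body interactions here.
 */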
6166 return n;
6169 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6171 char *val;
6172 int nst;
6174 nst = def;
6175 val = getenv(env_var);
6176 if (val)
6178 if (sscanf(val, "%20d", &nst) <= 0)
6180 nst = 1;
6182 if (fplog)
6184 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6185 env_var, val, nst);
6189 return nst;
6192 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6194 if (MASTER(cr))
6196 fprintf(stderr, "\n%s\n", warn_string);
6198 if (fplog)
6200 fprintf(fplog, "\n%s\n", warn_string);
6204 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6205 t_inputrec *ir, FILE *fplog)
6207 if (ir->ePBC == epbcSCREW &&
6208 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6210 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6213 if (ir->ns_type == ensSIMPLE)
6215 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6218 if (ir->nstlist == 0)
6220 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6223 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6225 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6229 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6231 int di, d;
6232 real r;
6234 r = ddbox->box_size[XX];
6235 for (di = 0; di < dd->ndim; di++)
6237 d = dd->dim[di];
6238 /* Check using the initial average cell size */
6239 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6242 return r;
6245 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6246 const char *dlb_opt, gmx_bool bRecordLoad,
6247 unsigned long Flags, t_inputrec *ir)
6249 int dlbState = -1;
6250 char buf[STRLEN];
6252 switch (dlb_opt[0])
6254 case 'a': dlbState = edlbsOffCanTurnOn; break;
6255 case 'n': dlbState = edlbsOffForever; break;
6256 case 'y': dlbState = edlbsOn; break;
6257 default: gmx_incons("Unknown dlb_opt");
6260 if (Flags & MD_RERUN)
6262 return edlbsOffForever;
6265 if (!EI_DYNAMICS(ir->eI))
6267 if (dlbState == edlbsOn)
6269 sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6270 dd_warning(cr, fplog, buf);
6273 return edlbsOffForever;
6276 if (!bRecordLoad)
6278 dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6279 return edlbsOffForever;
6282 if (Flags & MD_REPRODUCIBLE)
6284 switch (dlbState)
6286 case edlbsOffForever:
6287 break;
6288 case edlbsOffCanTurnOn:
6289 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6290 dlbState = edlbsOffForever;
6291 break;
6292 case edlbsOn:
6293 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6294 break;
6295 default:
6296 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6297 break;
6301 return dlbState;
6304 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6306 int dim;
6308 dd->ndim = 0;
6309 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6311 /* Decomposition order z,y,x */
6312 if (fplog)
6314 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6316 for (dim = DIM-1; dim >= 0; dim--)
6318 if (dd->nc[dim] > 1)
6320 dd->dim[dd->ndim++] = dim;
6324 else
6326 /* Decomposition order x,y,z */
6327 for (dim = 0; dim < DIM; dim++)
6329 if (dd->nc[dim] > 1)
6331 dd->dim[dd->ndim++] = dim;
6337 static gmx_domdec_comm_t *init_dd_comm()
6339 gmx_domdec_comm_t *comm;
6340 int i;
6342 snew(comm, 1);
6343 snew(comm->cggl_flag, DIM*2);
6344 snew(comm->cgcm_state, DIM*2);
6345 for (i = 0; i < DIM*2; i++)
6347 comm->cggl_flag_nalloc[i] = 0;
6348 comm->cgcm_state_nalloc[i] = 0;
6351 comm->nalloc_int = 0;
6352 comm->buf_int = NULL;
6354 vec_rvec_init(&comm->vbuf);
6356 comm->n_load_have = 0;
6357 comm->n_load_collect = 0;
6359 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6361 comm->sum_nat[i] = 0;
6363 comm->ndecomp = 0;
6364 comm->nload = 0;
6365 comm->load_step = 0;
6366 comm->load_sum = 0;
6367 comm->load_max = 0;
6368 clear_ivec(comm->load_lim);
6369 comm->load_mdf = 0;
6370 comm->load_pme = 0;
6372 return comm;
6375 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6376 unsigned long Flags,
6377 ivec nc,
6378 real comm_distance_min, real rconstr,
6379 const char *dlb_opt, real dlb_scale,
6380 const char *sizex, const char *sizey, const char *sizez,
6381 gmx_mtop_t *mtop, t_inputrec *ir,
6382 matrix box, rvec *x,
6383 gmx_ddbox_t *ddbox,
6384 int *npme_x, int *npme_y)
6386 gmx_domdec_t *dd;
6387 gmx_domdec_comm_t *comm;
6388 int recload;
6389 real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6390 gmx_bool bC;
6391 char buf[STRLEN];
6392 const real tenPercentMargin = 1.1;
6394 if (fplog)
6396 fprintf(fplog,
6397 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6400 snew(dd, 1);
6402 dd->comm = init_dd_comm();
6403 comm = dd->comm;
6404 snew(comm->cggl_flag, DIM*2);
6405 snew(comm->cgcm_state, DIM*2);
6407 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6408 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6410 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6411 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6412 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6413 recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6414 comm->nstSortCG = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6415 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6416 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6417 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6419 dd->pme_recv_f_alloc = 0;
6420 dd->pme_recv_f_buf = NULL;
6422 if (dd->bSendRecv2 && fplog)
6424 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6426 if (comm->eFlop)
6428 if (fplog)
6430 fprintf(fplog, "Will load balance based on FLOP count\n");
6432 if (comm->eFlop > 1)
6434 srand(1+cr->nodeid);
6436 comm->bRecordLoad = TRUE;
6438 else
6440 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6444 /* Initialize the GPU share count to 0, might change later */
6445 comm->nrank_gpu_shared = 0;
6447 comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6448 comm->bCheckWhetherToTurnDlbOn = TRUE;
6450 if (fplog)
6452 fprintf(fplog, "Dynamic load balancing: %s\n",
6453 edlbs_names[comm->dlbState]);
6455 comm->bPMELoadBalDLBLimits = FALSE;
6457 if (comm->nstSortCG)
6459 if (fplog)
6461 if (comm->nstSortCG == 1)
6463 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6465 else
6467 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6468 comm->nstSortCG);
6471 snew(comm->sort, 1);
6473 else
6475 if (fplog)
6477 fprintf(fplog, "Will not sort the charge groups\n");
6481 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6483 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6484 mtop->bIntermolecularInteractions);
6485 if (comm->bInterCGBondeds)
6487 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6489 else
6491 comm->bInterCGMultiBody = FALSE;
6494 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6495 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6497 if (ir->rlistlong == 0)
6499 /* Set the cut-off to some very large value,
6500 * so we don't need if statements everywhere in the code.
6501 * We use sqrt, since the cut-off is squared in some places.
6503 comm->cutoff = GMX_CUTOFF_INF;
6505 else
6507 comm->cutoff = ir->rlistlong;
6509 comm->cutoff_mbody = 0;
6511 comm->cellsize_limit = 0;
6512 comm->bBondComm = FALSE;
6514 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6515 * within nstlist steps. Since boundaries are allowed to displace by half
6516 * a cell size, DD cells should be at least the size of the list buffer.
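/* For illustration (hypothetical values): rlistlong = 1.1 nm with
 * rvdw = rcoulomb = 1.0 nm gives a 0.1 nm pair-list buffer, so this
 * criterion alone keeps DD cells at least 0.1 nm large.
 */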
6518 comm->cellsize_limit = std::max(comm->cellsize_limit,
6519 ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
6521 if (comm->bInterCGBondeds)
6523 if (comm_distance_min > 0)
6525 comm->cutoff_mbody = comm_distance_min;
6526 if (Flags & MD_DDBONDCOMM)
6528 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6530 else
6532 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6534 r_bonded_limit = comm->cutoff_mbody;
6536 else if (ir->bPeriodicMols)
6538 /* Cannot easily determine the required cut-off */
6539 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6540 comm->cutoff_mbody = comm->cutoff/2;
6541 r_bonded_limit = comm->cutoff_mbody;
6543 else
6545 if (MASTER(cr))
6547 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6548 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6550 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6551 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6553 /* We use an initial margin of 10% for the minimum cell size,
6554 * except when we are just below the non-bonded cut-off.
6556 if (Flags & MD_DDBONDCOMM)
6558 if (std::max(r_2b, r_mb) > comm->cutoff)
6560 r_bonded = std::max(r_2b, r_mb);
6561 r_bonded_limit = tenPercentMargin*r_bonded;
6562 comm->bBondComm = TRUE;
6564 else
6566 r_bonded = r_mb;
6567 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6569 /* We determine cutoff_mbody later */
6571 else
6573 /* No special bonded communication,
6574 * simply increase the DD cut-off.
6576 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6577 comm->cutoff_mbody = r_bonded_limit;
6578 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6581 if (fplog)
6583 fprintf(fplog,
6584 "Minimum cell size due to bonded interactions: %.3f nm\n",
6585 r_bonded_limit);
6587 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6590 if (dd->bInterCGcons && rconstr <= 0)
6592 /* There is a cell size limit due to the constraints (P-LINCS) */
6593 rconstr = constr_r_max(fplog, mtop, ir);
6594 if (fplog)
6596 fprintf(fplog,
6597 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6598 rconstr);
6599 if (rconstr > comm->cellsize_limit)
6601 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6605 else if (rconstr > 0 && fplog)
6607 /* Here we do not check for dd->bInterCGcons,
6608 * because one can also set a cell size limit for virtual sites only
6609 * and at this point we don't know yet if there are intercg v-sites.
6611 fprintf(fplog,
6612 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6613 rconstr);
6615 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6617 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6619 if (nc[XX] > 0)
6621 copy_ivec(nc, dd->nc);
6622 set_dd_dim(fplog, dd);
6623 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6625 if (cr->npmenodes == -1)
6627 cr->npmenodes = 0;
6629 acs = average_cellsize_min(dd, ddbox);
6630 if (acs < comm->cellsize_limit)
6632 if (fplog)
6634 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6636 gmx_fatal_collective(FARGS, cr, NULL,
6637 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6638 acs, comm->cellsize_limit);
6641 else
6643 set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6645 /* We need to choose the optimal DD grid and possibly PME nodes */
6646 limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6647 comm->dlbState != edlbsOffForever, dlb_scale,
6648 comm->cellsize_limit, comm->cutoff,
6649 comm->bInterCGBondeds);
6651 if (dd->nc[XX] == 0)
6653 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6654 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6655 !bC ? "-rdd" : "-rcon",
6656 comm->dlbState != edlbsOffForever ? " or -dds" : "",
6657 bC ? " or your LINCS settings" : "");
6659 gmx_fatal_collective(FARGS, cr, NULL,
6660 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6661 "%s\n"
6662 "Look in the log file for details on the domain decomposition",
6663 cr->nnodes-cr->npmenodes, limit, buf);
6665 set_dd_dim(fplog, dd);
6668 if (fplog)
6670 fprintf(fplog,
6671 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6672 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6675 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6676 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6678 gmx_fatal_collective(FARGS, cr, NULL,
6679 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6680 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6682 if (cr->npmenodes > dd->nnodes)
6684 gmx_fatal_collective(FARGS, cr, NULL,
6685 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6687 if (cr->npmenodes > 0)
6689 comm->npmenodes = cr->npmenodes;
6691 else
6693 comm->npmenodes = dd->nnodes;
6696 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6698 /* The following choices should match those
6699 * in comm_cost_est in domdec_setup.c.
6700 * Note that here the checks have to take into account
6701 * that the decomposition might occur in a different order than xyz
6702 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6703 * in which case they will not match those in comm_cost_est,
6704 * but since that is mainly for testing purposes that's fine.
6706 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6707 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6708 getenv("GMX_PMEONEDD") == NULL)
6710 comm->npmedecompdim = 2;
6711 comm->npmenodes_x = dd->nc[XX];
6712 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6714 else
6716 /* In case nc is 1 in both x and y we could still choose to
6717 * decompose pme in y instead of x, but we use x for simplicity.
6719 comm->npmedecompdim = 1;
6720 if (dd->dim[0] == YY)
6722 comm->npmenodes_x = 1;
6723 comm->npmenodes_y = comm->npmenodes;
6725 else
6727 comm->npmenodes_x = comm->npmenodes;
6728 comm->npmenodes_y = 1;
6731 if (fplog)
6733 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6734 comm->npmenodes_x, comm->npmenodes_y, 1);
6737 else
6739 comm->npmedecompdim = 0;
6740 comm->npmenodes_x = 0;
6741 comm->npmenodes_y = 0;
6744 /* Technically we don't need both of these,
6745 * but it simplifies code not having to recalculate it.
6747 *npme_x = comm->npmenodes_x;
6748 *npme_y = comm->npmenodes_y;
6750 snew(comm->slb_frac, DIM);
6751 if (comm->dlbState == edlbsOffForever)
6753 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6754 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6755 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6758 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6760 if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6762 /* Set the bonded communication distance to halfway
6763 * the minimum and the maximum,
6764 * since the extra communication cost is nearly zero.
6766 acs = average_cellsize_min(dd, ddbox);
6767 comm->cutoff_mbody = 0.5*(r_bonded + acs);
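/* For illustration (hypothetical values): r_bonded = 0.6 nm and an average
 * minimum cell size acs = 1.2 nm give cutoff_mbody = 0.9 nm here, before
 * the DLB-scaling and cut-off caps below can reduce it.
 */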
6768 if (comm->dlbState != edlbsOffForever)
6770 /* Check if this does not limit the scaling */
6771 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6773 if (!comm->bBondComm)
6775 /* Without bBondComm do not go beyond the n.b. cut-off */
6776 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6777 if (comm->cellsize_limit >= comm->cutoff)
6779 /* We don't lose a lot of efficiency
6780 * when increasing it to the n.b. cut-off.
6781 * It can even be slightly faster, because we need
6782 * less checks for the communication setup.
6784 comm->cutoff_mbody = comm->cutoff;
6787 /* Check if we did not end up below our original limit */
6788 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6790 if (comm->cutoff_mbody > comm->cellsize_limit)
6792 comm->cellsize_limit = comm->cutoff_mbody;
6795 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6798 if (debug)
6800 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6801 "cellsize limit %f\n",
6802 comm->bBondComm, comm->cellsize_limit);
6805 if (MASTER(cr))
6807 check_dd_restrictions(cr, dd, ir, fplog);
6810 comm->partition_step = INT_MIN;
6811 dd->ddp_count = 0;
6813 clear_dd_cycle_counts(dd);
6815 return dd;
6818 static void set_dlb_limits(gmx_domdec_t *dd)
6821 int d;
6823 for (d = 0; d < dd->ndim; d++)
6825 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6826 dd->comm->cellsize_min[dd->dim[d]] =
6827 dd->comm->cellsize_min_dlb[dd->dim[d]];
6832 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6834 gmx_domdec_t *dd;
6835 gmx_domdec_comm_t *comm;
6836 real cellsize_min;
6837 int d, nc, i;
6838 char buf[STRLEN];
6840 dd = cr->dd;
6841 comm = dd->comm;
6843 if (fplog)
6845 fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6848 cellsize_min = comm->cellsize_min[dd->dim[0]];
6849 for (d = 1; d < dd->ndim; d++)
6851 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6854 if (cellsize_min < comm->cellsize_limit*1.05)
6856 dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6858 /* Change DLB from "auto" to "no". */
6859 comm->dlbState = edlbsOffForever;
6861 return;
6864 dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6865 comm->dlbState = edlbsOn;
6867 set_dlb_limits(dd);
6869 /* We can set the required cell size info here,
6870 * so we do not need to communicate this.
6871 * The grid is completely uniform.
6873 for (d = 0; d < dd->ndim; d++)
6875 if (comm->root[d])
6877 comm->load[d].sum_m = comm->load[d].sum;
6879 nc = dd->nc[dd->dim[d]];
6880 for (i = 0; i < nc; i++)
6882 comm->root[d]->cell_f[i] = i/(real)nc;
6883 if (d > 0)
6885 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6886 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6889 comm->root[d]->cell_f[nc] = 1.0;
6894 static char *init_bLocalCG(gmx_mtop_t *mtop)
6896 int ncg, cg;
6897 char *bLocalCG;
6899 ncg = ncg_mtop(mtop);
6900 snew(bLocalCG, ncg);
6901 for (cg = 0; cg < ncg; cg++)
6903 bLocalCG[cg] = FALSE;
6906 return bLocalCG;
6909 void dd_init_bondeds(FILE *fplog,
6910 gmx_domdec_t *dd, gmx_mtop_t *mtop,
6911 gmx_vsite_t *vsite,
6912 t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6914 gmx_domdec_comm_t *comm;
6916 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6918 comm = dd->comm;
6920 if (comm->bBondComm)
6922 /* Communicate atoms beyond the cut-off for bonded interactions */
6923 comm = dd->comm;
6925 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6927 comm->bLocalCG = init_bLocalCG(mtop);
6929 else
6931 /* Only communicate atoms based on cut-off */
6932 comm->cglink = NULL;
6933 comm->bLocalCG = NULL;
6937 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6938 t_inputrec *ir,
6939 gmx_bool bDynLoadBal, real dlb_scale,
6940 gmx_ddbox_t *ddbox)
6942 gmx_domdec_comm_t *comm;
6943 int d;
6944 ivec np;
6945 real limit, shrink;
6946 char buf[64];
6948 if (fplog == NULL)
6950 return;
6953 comm = dd->comm;
6955 if (bDynLoadBal)
6957 fprintf(fplog, "The maximum number of communication pulses is:");
6958 for (d = 0; d < dd->ndim; d++)
6960 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6962 fprintf(fplog, "\n");
6963 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6964 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6965 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6966 for (d = 0; d < DIM; d++)
6968 if (dd->nc[d] > 1)
6970 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6972 shrink = 0;
6974 else
6976 shrink =
6977 comm->cellsize_min_dlb[d]/
6978 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6980 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6983 fprintf(fplog, "\n");
6985 else
6987 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6988 fprintf(fplog, "The initial number of communication pulses is:");
6989 for (d = 0; d < dd->ndim; d++)
6991 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6993 fprintf(fplog, "\n");
6994 fprintf(fplog, "The initial domain decomposition cell size is:");
6995 for (d = 0; d < DIM; d++)
6997 if (dd->nc[d] > 1)
6999 fprintf(fplog, " %c %.2f nm",
7000 dim2char(d), dd->comm->cellsize_min[d]);
7003 fprintf(fplog, "\n\n");
7006 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7008 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7009 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7010 "non-bonded interactions", "", comm->cutoff);
7012 if (bDynLoadBal)
7014 limit = dd->comm->cellsize_limit;
7016 else
7018 if (dynamic_dd_box(ddbox, ir))
7020 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7022 limit = dd->comm->cellsize_min[XX];
7023 for (d = 1; d < DIM; d++)
7025 limit = std::min(limit, dd->comm->cellsize_min[d]);
7029 if (comm->bInterCGBondeds)
7031 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7032 "two-body bonded interactions", "(-rdd)",
7033 std::max(comm->cutoff, comm->cutoff_mbody));
7034 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7035 "multi-body bonded interactions", "(-rdd)",
7036 (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7038 if (dd->vsite_comm)
7040 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7041 "virtual site constructions", "(-rcon)", limit);
7043 if (dd->constraint_comm)
7045 sprintf(buf, "atoms separated by up to %d constraints",
7046 1+ir->nProjOrder);
7047 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7048 buf, "(-rcon)", limit);
7050 fprintf(fplog, "\n");
7053 fflush(fplog);
7056 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7057 real dlb_scale,
7058 const t_inputrec *ir,
7059 const gmx_ddbox_t *ddbox)
7061 gmx_domdec_comm_t *comm;
7062 int d, dim, npulse, npulse_d_max, npulse_d;
7063 gmx_bool bNoCutOff;
7065 comm = dd->comm;
7067 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7069 /* Determine the maximum number of comm. pulses in one dimension */
7071 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7073 /* Determine the maximum required number of grid pulses */
7074 if (comm->cellsize_limit >= comm->cutoff)
7076 /* Only a single pulse is required */
7077 npulse = 1;
7079 else if (!bNoCutOff && comm->cellsize_limit > 0)
7081 /* We round down slightly here to avoid overhead due to the latency
7082 * of extra communication calls when the cut-off
7083 * would be only slightly longer than the cell size.
7084 * Later cellsize_limit is redetermined,
7085 * so we can not miss interactions due to this rounding.
7087 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
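/* For illustration (hypothetical values): cutoff = 1.015 nm and
 * cellsize_limit = 0.5 nm give cutoff/cellsize_limit = 2.03, so
 * npulse = (int)(0.96 + 2.03) = 2 rather than 3; the slight rounding down
 * is safe because cellsize_limit is redetermined afterwards.
 */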
7089 else
7091 /* There is no cell size limit */
7092 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7095 if (!bNoCutOff && npulse > 1)
7097 /* See if we can do with less pulses, based on dlb_scale */
7098 npulse_d_max = 0;
7099 for (d = 0; d < dd->ndim; d++)
7101 dim = dd->dim[d];
7102 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7103 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7104 npulse_d_max = std::max(npulse_d_max, npulse_d);
7106 npulse = std::min(npulse, npulse_d_max);
7109 /* This env var can override npulse */
7110 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7111 if (d > 0)
7113 npulse = d;
7116 comm->maxpulse = 1;
7117 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7118 for (d = 0; d < dd->ndim; d++)
7120 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7121 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7122 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7123 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7124 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7126 comm->bVacDLBNoLimit = FALSE;
7130 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7131 if (!comm->bVacDLBNoLimit)
7133 comm->cellsize_limit = std::max(comm->cellsize_limit,
7134 comm->cutoff/comm->maxpulse);
7136 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7137 /* Set the minimum cell size for each DD dimension */
7138 for (d = 0; d < dd->ndim; d++)
7140 if (comm->bVacDLBNoLimit ||
7141 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7143 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7145 else
7147 comm->cellsize_min_dlb[dd->dim[d]] =
7148 comm->cutoff/comm->cd[d].np_dlb;
7151 if (comm->cutoff_mbody <= 0)
7153 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7155 if (dlbIsOn(comm))
7157 set_dlb_limits(dd);
7161 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7163 /* If each molecule is a single charge group
7164 * or we use domain decomposition for each periodic dimension,
7165 * we do not need to take pbc into account for the bonded interactions.
7167 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7168 !(dd->nc[XX] > 1 &&
7169 dd->nc[YY] > 1 &&
7170 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7173 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7174 t_inputrec *ir, gmx_ddbox_t *ddbox)
7176 gmx_domdec_comm_t *comm;
7177 int natoms_tot;
7178 real vol_frac;
7180 comm = dd->comm;
7182 /* Initialize the thread data.
7183 * This can not be done in init_domain_decomposition,
7184 * as the number of threads is determined later.
7186 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7187 if (comm->nth > 1)
7189 snew(comm->dth, comm->nth);
7192 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7194 init_ddpme(dd, &comm->ddpme[0], 0);
7195 if (comm->npmedecompdim >= 2)
7197 init_ddpme(dd, &comm->ddpme[1], 1);
7200 else
7202 comm->npmenodes = 0;
7203 if (dd->pme_nodeid >= 0)
7205 gmx_fatal_collective(FARGS, NULL, dd,
7206 "Can not have separate PME ranks without PME electrostatics");
7210 if (debug)
7212 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7214 if (comm->dlbState != edlbsOffForever)
7216 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7219 print_dd_settings(fplog, dd, ir, dlbIsOn(comm), dlb_scale, ddbox);
7220 if (comm->dlbState == edlbsOffCanTurnOn)
7222 if (fplog)
7224 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7226 print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
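/* Estimate the fraction of the system covered by the home zone plus the
 * communicated zones; vol_frac*natoms_tot is passed to ga2la_init below
 * as an estimate of the number of atoms this rank will know about.
 */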
7229 if (ir->ePBC == epbcNONE)
7231 vol_frac = 1 - 1/(double)dd->nnodes;
7233 else
7235 vol_frac =
7236 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7238 if (debug)
7240 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7242 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7244 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7247 static gmx_bool test_dd_cutoff(t_commrec *cr,
7248 t_state *state, t_inputrec *ir,
7249 real cutoff_req)
7251 gmx_domdec_t *dd;
7252 gmx_ddbox_t ddbox;
7253 int d, dim, np;
7254 real inv_cell_size;
7255 int LocallyLimited;
7257 dd = cr->dd;
7259 set_ddbox(dd, FALSE, cr, ir, state->box,
7260 TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7262 LocallyLimited = 0;
7264 for (d = 0; d < dd->ndim; d++)
7266 dim = dd->dim[d];
7268 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7269 if (dynamic_dd_box(&ddbox, ir))
7271 inv_cell_size *= DD_PRES_SCALE_MARGIN;
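/* The number of communication pulses that would be needed along this
 * dimension for the requested cut-off.
 */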
7274 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7276 if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7277 dd->comm->cd[d].np_dlb > 0)
7279 if (np > dd->comm->cd[d].np_dlb)
7281 return FALSE;
7284 /* If a current local cell size is smaller than the requested
7285 * cut-off, we could still fix it, but this gets very complicated.
7286 * Without fixing it here, we might actually need more checks.
7288 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7290 LocallyLimited = 1;
7295 if (dd->comm->dlbState != edlbsOffForever)
7297 /* If DLB is not active yet, we don't need to check the grid jumps.
7298 * Actually we shouldn't, because then the grid jump data is not set.
7300 if (dlbIsOn(dd->comm) &&
7301 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7303 LocallyLimited = 1;
7306 gmx_sumi(1, &LocallyLimited, cr);
7308 if (LocallyLimited > 0)
7310 return FALSE;
7314 return TRUE;
7317 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7318 real cutoff_req)
7320 gmx_bool bCutoffAllowed;
7322 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7324 if (bCutoffAllowed)
7326 cr->dd->comm->cutoff = cutoff_req;
7329 return bCutoffAllowed;
7332 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7334 gmx_domdec_comm_t *comm;
7336 comm = cr->dd->comm;
7338 /* Turn on the DLB limiting (might have been on already) */
7339 comm->bPMELoadBalDLBLimits = TRUE;
7341 /* Change the cut-off limit */
7342 comm->PMELoadBal_max_cutoff = cutoff;
7344 if (debug)
7346 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7347 comm->PMELoadBal_max_cutoff);
7351 /* Sets whether we should later check the load imbalance data, so that
7352 * we can trigger dynamic load balancing if enough imbalance has
7353 * arisen.
7355 * Used after PME load balancing unlocks DLB, so that the check
7356 * whether DLB will be useful can happen immediately.
7358 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7360 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7362 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7366 /* Returns whether we should check whether there has been enough load
7367 * imbalance to trigger dynamic load balancing.
7369 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7371 const int nddp_chk_dlb = 100;
7373 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7375 return FALSE;
7378 /* We should check whether we should use DLB directly after
7379 * unlocking DLB. */
7380 if (dd->comm->bCheckWhetherToTurnDlbOn)
7382 /* This flag was set when the PME load-balancing routines
7383 unlocked DLB, and should now be cleared. */
7384 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7385 return TRUE;
7387 /* We should also check whether we should use DLB every 100
7388 * partitionings (we do not do this every partitioning, so that we
7389 * avoid excessive communication). */
7390 if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
7392 return TRUE;
7395 return FALSE;
7398 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7400 return (dd->comm->dlbState == edlbsOn);
7403 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7405 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7408 void dd_dlb_lock(gmx_domdec_t *dd)
7410 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7411 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7413 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7417 void dd_dlb_unlock(gmx_domdec_t *dd)
7419 /* We can only unlock the DLB when it was temporarily locked, otherwise don't do anything */
7420 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7422 dd->comm->dlbState = edlbsOffCanTurnOn;
7423 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
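/* Merge the charge groups received in this communication pulse into the
 * zone arrays: first shift the data already stored for the later zones to
 * make room, then append the received global indices, positions and cg
 * info and update the zone charge group ranges.
 */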
7427 static void merge_cg_buffers(int ncell,
7428 gmx_domdec_comm_dim_t *cd, int pulse,
7429 int *ncg_cell,
7430 int *index_gl, int *recv_i,
7431 rvec *cg_cm, rvec *recv_vr,
7432 int *cgindex,
7433 cginfo_mb_t *cginfo_mb, int *cginfo)
7435 gmx_domdec_ind_t *ind, *ind_p;
7436 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7437 int shift, shift_at;
7439 ind = &cd->ind[pulse];
7441 /* First correct the already stored data */
7442 shift = ind->nrecv[ncell];
7443 for (cell = ncell-1; cell >= 0; cell--)
7445 shift -= ind->nrecv[cell];
7446 if (shift > 0)
7448 /* Move the cg's present from previous grid pulses */
7449 cg0 = ncg_cell[ncell+cell];
7450 cg1 = ncg_cell[ncell+cell+1];
7451 cgindex[cg1+shift] = cgindex[cg1];
7452 for (cg = cg1-1; cg >= cg0; cg--)
7454 index_gl[cg+shift] = index_gl[cg];
7455 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7456 cgindex[cg+shift] = cgindex[cg];
7457 cginfo[cg+shift] = cginfo[cg];
7459 /* Correct the already stored send indices for the shift */
7460 for (p = 1; p <= pulse; p++)
7462 ind_p = &cd->ind[p];
7463 cg0 = 0;
7464 for (c = 0; c < cell; c++)
7466 cg0 += ind_p->nsend[c];
7468 cg1 = cg0 + ind_p->nsend[cell];
7469 for (cg = cg0; cg < cg1; cg++)
7471 ind_p->index[cg] += shift;
7477 /* Merge in the communicated buffers */
7478 shift = 0;
7479 shift_at = 0;
7480 cg0 = 0;
7481 for (cell = 0; cell < ncell; cell++)
7483 cg1 = ncg_cell[ncell+cell+1] + shift;
7484 if (shift_at > 0)
7486 /* Correct the old cg indices */
7487 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7489 cgindex[cg+1] += shift_at;
7492 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7494 /* Copy this charge group from the buffer */
7495 index_gl[cg1] = recv_i[cg0];
7496 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7497 /* Add it to the cgindex */
7498 cg_gl = index_gl[cg1];
7499 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7500 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7501 cgindex[cg1+1] = cgindex[cg1] + nat;
7502 cg0++;
7503 cg1++;
7504 shift_at += nat;
7506 shift += ind->nrecv[cell];
7507 ncg_cell[ncell+cell+1] = cg1;
7511 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7512 int nzone, int cg0, const int *cgindex)
7514 int cg, zone, p;
7516 /* Store the atom block boundaries for easy copying of communication buffers
7518 cg = cg0;
7519 for (zone = 0; zone < nzone; zone++)
7521 for (p = 0; p < cd->np; p++)
7523 cd->ind[p].cell2at0[zone] = cgindex[cg];
7524 cg += cd->ind[p].nrecv[zone];
7525 cd->ind[p].cell2at1[zone] = cgindex[cg];
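/* Returns whether charge group cg_gl is linked through bonded interactions
 * to at least one charge group that is not marked as local in bLocalCG.
 */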
7530 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7532 int i;
7533 gmx_bool bMiss;
7535 bMiss = FALSE;
7536 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7538 if (!bLocalCG[link->a[i]])
7540 bMiss = TRUE;
7544 return bMiss;
7547 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7548 typedef struct {
7549 real c[DIM][4]; /* the corners for the non-bonded communication */
7550 real cr0; /* corner for rounding */
7551 real cr1[4]; /* corners for rounding */
7552 real bc[DIM]; /* corners for bounded communication */
7553 real bcr1; /* corner for rounding for bonded communication */
7554 } dd_corners_t;
7556 /* Determine the corners of the domain(s) we are communicating with */
7557 static void
7558 set_dd_corners(const gmx_domdec_t *dd,
7559 int dim0, int dim1, int dim2,
7560 gmx_bool bDistMB,
7561 dd_corners_t *c)
7563 const gmx_domdec_comm_t *comm;
7564 const gmx_domdec_zones_t *zones;
7565 int i, j;
7567 comm = dd->comm;
7569 zones = &comm->zones;
7571 /* Keep the compiler happy */
7572 c->cr0 = 0;
7573 c->bcr1 = 0;
7575 /* The first dimension is equal for all cells */
7576 c->c[0][0] = comm->cell_x0[dim0];
7577 if (bDistMB)
7579 c->bc[0] = c->c[0][0];
7581 if (dd->ndim >= 2)
7583 dim1 = dd->dim[1];
7584 /* This cell row is only seen from the first row */
7585 c->c[1][0] = comm->cell_x0[dim1];
7586 /* All rows can see this row */
7587 c->c[1][1] = comm->cell_x0[dim1];
7588 if (dlbIsOn(dd->comm))
7590 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7591 if (bDistMB)
7593 /* For the multi-body distance we need the maximum */
7594 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7597 /* Set the upper-right corner for rounding */
7598 c->cr0 = comm->cell_x1[dim0];
7600 if (dd->ndim >= 3)
7602 dim2 = dd->dim[2];
7603 for (j = 0; j < 4; j++)
7605 c->c[2][j] = comm->cell_x0[dim2];
7607 if (dlbIsOn(dd->comm))
7609 /* Use the maximum of the i-cells that see a j-cell */
7610 for (i = 0; i < zones->nizone; i++)
7612 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7614 if (j >= 4)
7616 c->c[2][j-4] =
7617 std::max(c->c[2][j-4],
7618 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7622 if (bDistMB)
7624 /* For the multi-body distance we need the maximum */
7625 c->bc[2] = comm->cell_x0[dim2];
7626 for (i = 0; i < 2; i++)
7628 for (j = 0; j < 2; j++)
7630 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7636 /* Set the upper-right corner for rounding */
7637 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7638 * Only cell (0,0,0) can see cell 7 (1,1,1)
7640 c->cr1[0] = comm->cell_x1[dim1];
7641 c->cr1[3] = comm->cell_x1[dim1];
7642 if (dlbIsOn(dd->comm))
7644 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7645 if (bDistMB)
7647 /* For the multi-body distance we need the maximum */
7648 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7655 /* Determine which cg's we need to send in this pulse from this zone */
7656 static void
7657 get_zone_pulse_cgs(gmx_domdec_t *dd,
7658 int zonei, int zone,
7659 int cg0, int cg1,
7660 const int *index_gl,
7661 const int *cgindex,
7662 int dim, int dim_ind,
7663 int dim0, int dim1, int dim2,
7664 real r_comm2, real r_bcomm2,
7665 matrix box,
7666 ivec tric_dist,
7667 rvec *normal,
7668 real skew_fac2_d, real skew_fac_01,
7669 rvec *v_d, rvec *v_0, rvec *v_1,
7670 const dd_corners_t *c,
7671 rvec sf2_round,
7672 gmx_bool bDistBonded,
7673 gmx_bool bBondComm,
7674 gmx_bool bDist2B,
7675 gmx_bool bDistMB,
7676 rvec *cg_cm,
7677 int *cginfo,
7678 gmx_domdec_ind_t *ind,
7679 int **ibuf, int *ibuf_nalloc,
7680 vec_rvec_t *vbuf,
7681 int *nsend_ptr,
7682 int *nat_ptr,
7683 int *nsend_z_ptr)
7685 gmx_domdec_comm_t *comm;
7686 gmx_bool bScrew;
7687 gmx_bool bDistMB_pulse;
7688 int cg, i;
7689 real r2, rb2, r, tric_sh;
7690 rvec rn, rb;
7691 int dimd;
7692 int nsend_z, nsend, nat;
7694 comm = dd->comm;
7696 bScrew = (dd->bScrewPBC && dim == XX);
7698 bDistMB_pulse = (bDistMB && bDistBonded);
7700 nsend_z = 0;
7701 nsend = *nsend_ptr;
7702 nat = *nat_ptr;
7704 for (cg = cg0; cg < cg1; cg++)
7706 r2 = 0;
7707 rb2 = 0;
7708 if (tric_dist[dim_ind] == 0)
7710 /* Rectangular direction, easy */
7711 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7712 if (r > 0)
7714 r2 += r*r;
7716 if (bDistMB_pulse)
7718 r = cg_cm[cg][dim] - c->bc[dim_ind];
7719 if (r > 0)
7721 rb2 += r*r;
7724 /* Rounding gives at most a 16% reduction
7725 * in communicated atoms
7727 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7729 r = cg_cm[cg][dim0] - c->cr0;
7730 /* This is the first dimension, so always r >= 0 */
7731 r2 += r*r;
7732 if (bDistMB_pulse)
7734 rb2 += r*r;
7737 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7739 r = cg_cm[cg][dim1] - c->cr1[zone];
7740 if (r > 0)
7742 r2 += r*r;
7744 if (bDistMB_pulse)
7746 r = cg_cm[cg][dim1] - c->bcr1;
7747 if (r > 0)
7749 rb2 += r*r;
7754 else
7756 /* Triclinic direction, more complicated */
7757 clear_rvec(rn);
7758 clear_rvec(rb);
7759 /* Rounding, conservative as the skew_fac multiplication
7760 * will slightly underestimate the distance.
7762 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7764 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7765 for (i = dim0+1; i < DIM; i++)
7767 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7769 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7770 if (bDistMB_pulse)
7772 rb[dim0] = rn[dim0];
7773 rb2 = r2;
7775 /* Take care that the cell planes along dim0 might not
7776 * be orthogonal to those along dim1 and dim2.
7778 for (i = 1; i <= dim_ind; i++)
7780 dimd = dd->dim[i];
7781 if (normal[dim0][dimd] > 0)
7783 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7784 if (bDistMB_pulse)
7786 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7791 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7793 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7794 tric_sh = 0;
7795 for (i = dim1+1; i < DIM; i++)
7797 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7799 rn[dim1] += tric_sh;
7800 if (rn[dim1] > 0)
7802 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7803 /* Take care of coupling of the distances
7804 * to the planes along dim0 and dim1 through dim2.
7806 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7807 /* Take care that the cell planes along dim1
7808 * might not be orthogonal to that along dim2.
7810 if (normal[dim1][dim2] > 0)
7812 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7815 if (bDistMB_pulse)
7817 rb[dim1] +=
7818 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7819 if (rb[dim1] > 0)
7821 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7822 /* Take care of coupling of the distances
7823 * to the planes along dim0 and dim1 through dim2.
7825 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7826 /* Take care that the cell planes along dim1
7827 * might not be orthogonal to that along dim2.
7829 if (normal[dim1][dim2] > 0)
7831 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7836 /* The distance along the communication direction */
7837 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7838 tric_sh = 0;
7839 for (i = dim+1; i < DIM; i++)
7841 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7843 rn[dim] += tric_sh;
7844 if (rn[dim] > 0)
7846 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7847 /* Take care of coupling of the distances
7848 * to the planes along dim0 and dim1 through dim2.
7850 if (dim_ind == 1 && zonei == 1)
7852 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7855 if (bDistMB_pulse)
7857 clear_rvec(rb);
7858 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7859 if (rb[dim] > 0)
7861 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7862 /* Take care of coupling of the distances
7863 * to the planes along dim0 and dim1 through dim2.
7865 if (dim_ind == 1 && zonei == 1)
7867 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7873 if (r2 < r_comm2 ||
7874 (bDistBonded &&
7875 ((bDistMB && rb2 < r_bcomm2) ||
7876 (bDist2B && r2 < r_bcomm2)) &&
7877 (!bBondComm ||
7878 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7879 missing_link(comm->cglink, index_gl[cg],
7880 comm->bLocalCG)))))
7882 /* Make an index to the local charge groups */
7883 if (nsend+1 > ind->nalloc)
7885 ind->nalloc = over_alloc_large(nsend+1);
7886 srenew(ind->index, ind->nalloc);
7888 if (nsend+1 > *ibuf_nalloc)
7890 *ibuf_nalloc = over_alloc_large(nsend+1);
7891 srenew(*ibuf, *ibuf_nalloc);
7893 ind->index[nsend] = cg;
7894 (*ibuf)[nsend] = index_gl[cg];
7895 nsend_z++;
7896 vec_rvec_check_alloc(vbuf, nsend+1);
7898 if (dd->ci[dim] == 0)
7900 /* Correct cg_cm for pbc */
7901 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7902 if (bScrew)
7904 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7905 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7908 else
7910 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7912 nsend++;
7913 nat += cgindex[cg+1] - cgindex[cg];
7917 *nsend_ptr = nsend;
7918 *nat_ptr = nat;
7919 *nsend_z_ptr = nsend_z;
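/* Set up the halo communication for the current decomposition: for each DD
 * dimension and pulse, select the charge groups to send with
 * get_zone_pulse_cgs, communicate the counts, global indices and positions
 * backward along the grid, and update the zone charge group ranges and the
 * communicated atom counts.
 */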
7922 static void setup_dd_communication(gmx_domdec_t *dd,
7923 matrix box, gmx_ddbox_t *ddbox,
7924 t_forcerec *fr, t_state *state, rvec **f)
7926 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7927 int nzone, nzone_send, zone, zonei, cg0, cg1;
7928 int c, i, cg, cg_gl, nrcg;
7929 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7930 gmx_domdec_comm_t *comm;
7931 gmx_domdec_zones_t *zones;
7932 gmx_domdec_comm_dim_t *cd;
7933 gmx_domdec_ind_t *ind;
7934 cginfo_mb_t *cginfo_mb;
7935 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7936 real r_comm2, r_bcomm2;
7937 dd_corners_t corners;
7938 ivec tric_dist;
7939 rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7940 real skew_fac2_d, skew_fac_01;
7941 rvec sf2_round;
7942 int nsend, nat;
7943 int th;
7945 if (debug)
7947 fprintf(debug, "Setting up DD communication\n");
7950 comm = dd->comm;
7952 switch (fr->cutoff_scheme)
7954 case ecutsGROUP:
7955 cg_cm = fr->cg_cm;
7956 break;
7957 case ecutsVERLET:
7958 cg_cm = state->x;
7959 break;
7960 default:
7961 gmx_incons("unimplemented");
7962 cg_cm = NULL;
7965 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7967 /* Check if we need to use triclinic distances */
7968 tric_dist[dim_ind] = 0;
7969 for (i = 0; i <= dim_ind; i++)
7971 if (ddbox->tric_dir[dd->dim[i]])
7973 tric_dist[dim_ind] = 1;
7978 bBondComm = comm->bBondComm;
7980 /* Do we need to determine extra distances for multi-body bondeds? */
7981 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
7983 /* Do we need to determine extra distances for only two-body bondeds? */
7984 bDist2B = (bBondComm && !bDistMB);
7986 r_comm2 = sqr(comm->cutoff);
7987 r_bcomm2 = sqr(comm->cutoff_mbody);
7989 if (debug)
7991 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
7994 zones = &comm->zones;
7996 dim0 = dd->dim[0];
7997 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7998 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8000 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8002 /* Triclinic stuff */
8003 normal = ddbox->normal;
8004 skew_fac_01 = 0;
8005 if (dd->ndim >= 2)
8007 v_0 = ddbox->v[dim0];
8008 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8010 /* Determine the coupling coefficient for the distances
8011 * to the cell planes along dim0 and dim1 through dim2.
8012 * This is required for correct rounding.
8014 skew_fac_01 =
8015 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8016 if (debug)
8018 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8022 if (dd->ndim >= 3)
8024 v_1 = ddbox->v[dim1];
8027 zone_cg_range = zones->cg_range;
8028 index_gl = dd->index_gl;
8029 cgindex = dd->cgindex;
8030 cginfo_mb = fr->cginfo_mb;
8032 zone_cg_range[0] = 0;
8033 zone_cg_range[1] = dd->ncg_home;
8034 comm->zone_ncg1[0] = dd->ncg_home;
8035 pos_cg = dd->ncg_home;
8037 nat_tot = dd->nat_home;
8038 nzone = 1;
8039 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8041 dim = dd->dim[dim_ind];
8042 cd = &comm->cd[dim_ind];
8044 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8046 /* No pbc in this dimension, so the first node should not communicate */
8047 nzone_send = 0;
8049 else
8051 nzone_send = nzone;
8054 v_d = ddbox->v[dim];
8055 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8057 cd->bInPlace = TRUE;
8058 for (p = 0; p < cd->np; p++)
8060 /* Only atoms communicated in the first pulse are used
8061 * for multi-body bonded interactions or for bBondComm.
8063 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8065 ind = &cd->ind[p];
8066 nsend = 0;
8067 nat = 0;
8068 for (zone = 0; zone < nzone_send; zone++)
8070 if (tric_dist[dim_ind] && dim_ind > 0)
8072 /* Determine slightly more optimized skew_fac's
8073 * for rounding.
8074 * This reduces the number of communicated atoms
8075 * by about 10% for 3D DD of rhombic dodecahedra.
8077 for (dimd = 0; dimd < dim; dimd++)
8079 sf2_round[dimd] = 1;
8080 if (ddbox->tric_dir[dimd])
8082 for (i = dd->dim[dimd]+1; i < DIM; i++)
8084 /* If we are shifted in dimension i
8085 * and the cell plane is tilted forward
8086 * in dimension i, skip this coupling.
8088 if (!(zones->shift[nzone+zone][i] &&
8089 ddbox->v[dimd][i][dimd] >= 0))
8091 sf2_round[dimd] +=
8092 sqr(ddbox->v[dimd][i][dimd]);
8095 sf2_round[dimd] = 1/sf2_round[dimd];
8100 zonei = zone_perm[dim_ind][zone];
8101 if (p == 0)
8103 /* Here we permute the zones to obtain a convenient order
8104 * for neighbor searching
8106 cg0 = zone_cg_range[zonei];
8107 cg1 = zone_cg_range[zonei+1];
8109 else
8111 /* Look only at the cg's received in the previous grid pulse
8113 cg1 = zone_cg_range[nzone+zone+1];
8114 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8117 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8118 for (th = 0; th < comm->nth; th++)
8122 gmx_domdec_ind_t *ind_p;
8123 int **ibuf_p, *ibuf_nalloc_p;
8124 vec_rvec_t *vbuf_p;
8125 int *nsend_p, *nat_p;
8126 int *nsend_zone_p;
8127 int cg0_th, cg1_th;
8129 if (th == 0)
8131 /* Thread 0 writes in the comm buffers */
8132 ind_p = ind;
8133 ibuf_p = &comm->buf_int;
8134 ibuf_nalloc_p = &comm->nalloc_int;
8135 vbuf_p = &comm->vbuf;
8136 nsend_p = &nsend;
8137 nat_p = &nat;
8138 nsend_zone_p = &ind->nsend[zone];
8140 else
8142 /* Other threads write into temp buffers */
8143 ind_p = &comm->dth[th].ind;
8144 ibuf_p = &comm->dth[th].ibuf;
8145 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8146 vbuf_p = &comm->dth[th].vbuf;
8147 nsend_p = &comm->dth[th].nsend;
8148 nat_p = &comm->dth[th].nat;
8149 nsend_zone_p = &comm->dth[th].nsend_zone;
8151 comm->dth[th].nsend = 0;
8152 comm->dth[th].nat = 0;
8153 comm->dth[th].nsend_zone = 0;
8156 if (comm->nth == 1)
8158 cg0_th = cg0;
8159 cg1_th = cg1;
8161 else
8163 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8164 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8167 /* Get the cg's for this pulse in this zone */
8168 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8169 index_gl, cgindex,
8170 dim, dim_ind, dim0, dim1, dim2,
8171 r_comm2, r_bcomm2,
8172 box, tric_dist,
8173 normal, skew_fac2_d, skew_fac_01,
8174 v_d, v_0, v_1, &corners, sf2_round,
8175 bDistBonded, bBondComm,
8176 bDist2B, bDistMB,
8177 cg_cm, fr->cginfo,
8178 ind_p,
8179 ibuf_p, ibuf_nalloc_p,
8180 vbuf_p,
8181 nsend_p, nat_p,
8182 nsend_zone_p);
8184 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8185 } // END
8187 /* Append data of threads>=1 to the communication buffers */
8188 for (th = 1; th < comm->nth; th++)
8190 dd_comm_setup_work_t *dth;
8191 int i, ns1;
8193 dth = &comm->dth[th];
8195 ns1 = nsend + dth->nsend_zone;
8196 if (ns1 > ind->nalloc)
8198 ind->nalloc = over_alloc_dd(ns1);
8199 srenew(ind->index, ind->nalloc);
8201 if (ns1 > comm->nalloc_int)
8203 comm->nalloc_int = over_alloc_dd(ns1);
8204 srenew(comm->buf_int, comm->nalloc_int);
8206 if (ns1 > comm->vbuf.nalloc)
8208 comm->vbuf.nalloc = over_alloc_dd(ns1);
8209 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8212 for (i = 0; i < dth->nsend_zone; i++)
8214 ind->index[nsend] = dth->ind.index[i];
8215 comm->buf_int[nsend] = dth->ibuf[i];
8216 copy_rvec(dth->vbuf.v[i],
8217 comm->vbuf.v[nsend]);
8218 nsend++;
8220 nat += dth->nat;
8221 ind->nsend[zone] += dth->nsend_zone;
8224 /* Clear the counts in case we do not have pbc */
8225 for (zone = nzone_send; zone < nzone; zone++)
8227 ind->nsend[zone] = 0;
8229 ind->nsend[nzone] = nsend;
8230 ind->nsend[nzone+1] = nat;
8231 /* Communicate the number of cg's and atoms to receive */
8232 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8233 ind->nsend, nzone+2,
8234 ind->nrecv, nzone+2);
8236 /* The rvec buffer is also required for atom buffers of size nsend
8237 * in dd_move_x and dd_move_f.
8239 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8241 if (p > 0)
8243 /* We can receive in place if only the last zone is not empty */
8244 for (zone = 0; zone < nzone-1; zone++)
8246 if (ind->nrecv[zone] > 0)
8248 cd->bInPlace = FALSE;
8251 if (!cd->bInPlace)
8253 /* The int buffer is only required here for the cg indices */
8254 if (ind->nrecv[nzone] > comm->nalloc_int2)
8256 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8257 srenew(comm->buf_int2, comm->nalloc_int2);
8259 /* The rvec buffer is also required for atom buffers
8260 * of size nrecv in dd_move_x and dd_move_f.
8262 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8263 vec_rvec_check_alloc(&comm->vbuf2, i);
8267 /* Make space for the global cg indices */
8268 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8269 || dd->cg_nalloc == 0)
8271 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8272 srenew(index_gl, dd->cg_nalloc);
8273 srenew(cgindex, dd->cg_nalloc+1);
8275 /* Communicate the global cg indices */
8276 if (cd->bInPlace)
8278 recv_i = index_gl + pos_cg;
8280 else
8282 recv_i = comm->buf_int2;
8284 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8285 comm->buf_int, nsend,
8286 recv_i, ind->nrecv[nzone]);
8288 /* Make space for cg_cm */
8289 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8290 if (fr->cutoff_scheme == ecutsGROUP)
8292 cg_cm = fr->cg_cm;
8294 else
8296 cg_cm = state->x;
8298 /* Communicate cg_cm */
8299 if (cd->bInPlace)
8301 recv_vr = cg_cm + pos_cg;
8303 else
8305 recv_vr = comm->vbuf2.v;
8307 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8308 comm->vbuf.v, nsend,
8309 recv_vr, ind->nrecv[nzone]);
8311 /* Make the charge group index */
8312 if (cd->bInPlace)
8314 zone = (p == 0 ? 0 : nzone - 1);
8315 while (zone < nzone)
8317 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8319 cg_gl = index_gl[pos_cg];
8320 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8321 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8322 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8323 if (bBondComm)
8325 /* Update the charge group presence,
8326 * so we can use it in the next pass of the loop.
8328 comm->bLocalCG[cg_gl] = TRUE;
8330 pos_cg++;
8332 if (p == 0)
8334 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8336 zone++;
8337 zone_cg_range[nzone+zone] = pos_cg;
8340 else
8342 /* This part of the code is never executed with bBondComm. */
8343 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8344 index_gl, recv_i, cg_cm, recv_vr,
8345 cgindex, fr->cginfo_mb, fr->cginfo);
8346 pos_cg += ind->nrecv[nzone];
8348 nat_tot += ind->nrecv[nzone+1];
8350 if (!cd->bInPlace)
8352 /* Store the atom block for easy copying of communication buffers */
8353 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8355 nzone += nzone;
8357 dd->index_gl = index_gl;
8358 dd->cgindex = cgindex;
8360 dd->ncg_tot = zone_cg_range[zones->n];
8361 dd->nat_tot = nat_tot;
8362 comm->nat[ddnatHOME] = dd->nat_home;
8363 for (i = ddnatZONE; i < ddnatNR; i++)
8365 comm->nat[i] = dd->nat_tot;
8368 if (!bBondComm)
8370 /* We don't need to update cginfo, since that was already done above.
8371 * So we pass NULL for the forcerec.
8373 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8374 NULL, comm->bLocalCG);
8377 if (debug)
8379 fprintf(debug, "Finished setting up DD communication, zones:");
8380 for (c = 0; c < zones->n; c++)
8382 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8384 fprintf(debug, "\n");
8388 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8390 int c;
8392 for (c = 0; c < zones->nizone; c++)
8394 zones->izone[c].cg1 = zones->cg_range[c+1];
8395 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8396 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
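/* Set the zone boundaries and bounding boxes for zones zone_start up to
 * zone_end, taking the cut-offs and, with dynamic load balancing, the
 * staggered cell boundaries into account.
 */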
8400 static void set_zones_size(gmx_domdec_t *dd,
8401 matrix box, const gmx_ddbox_t *ddbox,
8402 int zone_start, int zone_end)
8404 gmx_domdec_comm_t *comm;
8405 gmx_domdec_zones_t *zones;
8406 gmx_bool bDistMB;
8407 int z, zi, d, dim;
8408 real rcs, rcmbs;
8409 int i, j;
8410 real vol;
8412 comm = dd->comm;
8414 zones = &comm->zones;
8416 /* Do we need to determine extra distances for multi-body bondeds? */
8417 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8419 for (z = zone_start; z < zone_end; z++)
8421 /* Copy cell limits to zone limits.
8422 * Valid for non-DD dims and non-shifted dims.
8424 copy_rvec(comm->cell_x0, zones->size[z].x0);
8425 copy_rvec(comm->cell_x1, zones->size[z].x1);
8428 for (d = 0; d < dd->ndim; d++)
8430 dim = dd->dim[d];
8432 for (z = 0; z < zones->n; z++)
8434 /* With a staggered grid we have different sizes
8435 * for non-shifted dimensions.
8437 if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8439 if (d == 1)
8441 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8442 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8444 else if (d == 2)
8446 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8447 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8452 rcs = comm->cutoff;
8453 rcmbs = comm->cutoff_mbody;
8454 if (ddbox->tric_dir[dim])
8456 rcs /= ddbox->skew_fac[dim];
8457 rcmbs /= ddbox->skew_fac[dim];
8460 /* Set the lower limit for the shifted zone dimensions */
8461 for (z = zone_start; z < zone_end; z++)
8463 if (zones->shift[z][dim] > 0)
8465 dim = dd->dim[d];
8466 if (!dlbIsOn(dd->comm) || d == 0)
8468 zones->size[z].x0[dim] = comm->cell_x1[dim];
8469 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8471 else
8473 /* Here we take the lower limit of the zone from
8474 * the lowest domain of the zone below.
8476 if (z < 4)
8478 zones->size[z].x0[dim] =
8479 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8481 else
8483 if (d == 1)
8485 zones->size[z].x0[dim] =
8486 zones->size[zone_perm[2][z-4]].x0[dim];
8488 else
8490 zones->size[z].x0[dim] =
8491 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8494 /* A temporary limit, updated below */
8495 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8497 if (bDistMB)
8499 for (zi = 0; zi < zones->nizone; zi++)
8501 if (zones->shift[zi][dim] == 0)
8503 /* This takes the whole zone into account.
8504 * With multiple pulses this will lead
8505 * to a larger zone than strictly necessary.
8507 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8508 zones->size[zi].x1[dim]+rcmbs);
8516 /* Loop over the i-zones to set the upper limit of each
8517 * j-zone they see.
8519 for (zi = 0; zi < zones->nizone; zi++)
8521 if (zones->shift[zi][dim] == 0)
8523 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8525 if (zones->shift[z][dim] > 0)
8527 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8528 zones->size[zi].x1[dim]+rcs);
8535 for (z = zone_start; z < zone_end; z++)
8537 /* Initialization only required to keep the compiler happy */
8538 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8539 int nc, c;
8541 /* To determine the bounding box for a zone we need to find
8542 * the extreme corners of 4, 2 or 1 corners.
8544 nc = 1 << (ddbox->nboundeddim - 1);
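/* For example, nboundeddim = 3 gives nc = 4 (all combinations of the y and
 * z limits), nboundeddim = 2 gives nc = 2 and nboundeddim = 1 gives nc = 1.
 */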
8546 for (c = 0; c < nc; c++)
8548 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8549 corner[XX] = 0;
8550 if ((c & 1) == 0)
8552 corner[YY] = zones->size[z].x0[YY];
8554 else
8556 corner[YY] = zones->size[z].x1[YY];
8558 if ((c & 2) == 0)
8560 corner[ZZ] = zones->size[z].x0[ZZ];
8562 else
8564 corner[ZZ] = zones->size[z].x1[ZZ];
8566 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8567 box[ZZ][1 - dd->dim[0]] != 0)
8569 /* With 1D domain decomposition the cg's are not in
8570 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8571 * Shift the corner of the z-vector back to along the box
8572 * vector of dimension d, so it will later end up at 0 along d.
8573 * This can affect the location of this corner along dd->dim[0]
8574 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8576 int d = 1 - dd->dim[0];
8578 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8580 /* Apply the triclinic couplings */
8581 assert(ddbox->npbcdim <= DIM);
8582 for (i = YY; i < ddbox->npbcdim; i++)
8584 for (j = XX; j < i; j++)
8586 corner[j] += corner[i]*box[i][j]/box[i][i];
8589 if (c == 0)
8591 copy_rvec(corner, corner_min);
8592 copy_rvec(corner, corner_max);
8594 else
8596 for (i = 0; i < DIM; i++)
8598 corner_min[i] = std::min(corner_min[i], corner[i]);
8599 corner_max[i] = std::max(corner_max[i], corner[i]);
8603 /* Copy the extreme corners without offset along x */
8604 for (i = 0; i < DIM; i++)
8606 zones->size[z].bb_x0[i] = corner_min[i];
8607 zones->size[z].bb_x1[i] = corner_max[i];
8609 /* Add the offset along x */
8610 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8611 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8614 if (zone_start == 0)
8616 vol = 1;
8617 for (dim = 0; dim < DIM; dim++)
8619 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8621 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8624 if (debug)
8626 for (z = zone_start; z < zone_end; z++)
8628 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8630 zones->size[z].x0[XX], zones->size[z].x1[XX],
8631 zones->size[z].x0[YY], zones->size[z].x1[YY],
8632 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8633 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8635 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8636 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8637 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
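/* qsort comparator for charge groups: order by ns grid cell index first,
 * then by global charge group index.
 */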
8642 static int comp_cgsort(const void *a, const void *b)
8644 int comp;
8646 gmx_cgsort_t *cga, *cgb;
8647 cga = (gmx_cgsort_t *)a;
8648 cgb = (gmx_cgsort_t *)b;
8650 comp = cga->nsc - cgb->nsc;
8651 if (comp == 0)
8653 comp = cga->ind_gl - cgb->ind_gl;
8656 return comp;
8659 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8660 int *a, int *buf)
8662 int i;
8664 /* Order the data */
8665 for (i = 0; i < n; i++)
8667 buf[i] = a[sort[i].ind];
8670 /* Copy back to the original array */
8671 for (i = 0; i < n; i++)
8673 a[i] = buf[i];
8677 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8678 rvec *v, rvec *buf)
8680 int i;
8682 /* Order the data */
8683 for (i = 0; i < n; i++)
8685 copy_rvec(v[sort[i].ind], buf[i]);
8688 /* Copy back to the original array */
8689 for (i = 0; i < n; i++)
8691 copy_rvec(buf[i], v[i]);
8695 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8696 rvec *v, rvec *buf)
8698 int a, atot, cg, cg0, cg1, i;
8700 if (cgindex == NULL)
8702 /* Avoid the useless loop of the atoms within a cg */
8703 order_vec_cg(ncg, sort, v, buf);
8705 return;
8708 /* Order the data */
8709 a = 0;
8710 for (cg = 0; cg < ncg; cg++)
8712 cg0 = cgindex[sort[cg].ind];
8713 cg1 = cgindex[sort[cg].ind+1];
8714 for (i = cg0; i < cg1; i++)
8716 copy_rvec(v[i], buf[a]);
8717 a++;
8720 atot = a;
8722 /* Copy back to the original array */
8723 for (a = 0; a < atot; a++)
8725 copy_rvec(buf[a], v[a]);
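/* Sort the (usually short) list of moved charge groups in sort_new with
 * qsort and merge it with the already ordered list sort2 into sort1.
 */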
8729 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8730 int nsort_new, gmx_cgsort_t *sort_new,
8731 gmx_cgsort_t *sort1)
8733 int i1, i2, i_new;
8735 /* The new indices are not very ordered, so we qsort them */
8736 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8738 /* sort2 is already ordered, so now we can merge the two arrays */
8739 i1 = 0;
8740 i2 = 0;
8741 i_new = 0;
8742 while (i2 < nsort2 || i_new < nsort_new)
8744 if (i2 == nsort2)
8746 sort1[i1++] = sort_new[i_new++];
8748 else if (i_new == nsort_new)
8750 sort1[i1++] = sort2[i2++];
8752 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8753 (sort2[i2].nsc == sort_new[i_new].nsc &&
8754 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8756 sort1[i1++] = sort2[i2++];
8758 else
8760 sort1[i1++] = sort_new[i_new++];
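/* Determine the charge group sort order from the ns grid cell indices and
 * return the new number of home charge groups, i.e. those that did not
 * move to another rank.
 */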
8765 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8767 gmx_domdec_sort_t *sort;
8768 gmx_cgsort_t *cgsort, *sort_i;
8769 int ncg_new, nsort2, nsort_new, i, *a, moved;
8771 sort = dd->comm->sort;
8773 a = fr->ns.grid->cell_index;
8775 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8777 if (ncg_home_old >= 0)
8779 /* The charge groups that remained in the same ns grid cell
8780 * are completely ordered. So we can sort efficiently by merging
8781 * the charge groups that did move into the stationary list.
8783 ncg_new = 0;
8784 nsort2 = 0;
8785 nsort_new = 0;
8786 for (i = 0; i < dd->ncg_home; i++)
8788 /* Check if this cg did not move to another node */
8789 if (a[i] < moved)
8791 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8793 /* This cg is new on this node or moved ns grid cell */
8794 if (nsort_new >= sort->sort_new_nalloc)
8796 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8797 srenew(sort->sort_new, sort->sort_new_nalloc);
8799 sort_i = &(sort->sort_new[nsort_new++]);
8801 else
8803 /* This cg did not move */
8804 sort_i = &(sort->sort2[nsort2++]);
8806 /* Sort on the ns grid cell indices
8807 * and the global topology index.
8808 * index_gl is irrelevant with cell ns,
8809 * but we set it here anyhow to avoid a conditional.
8811 sort_i->nsc = a[i];
8812 sort_i->ind_gl = dd->index_gl[i];
8813 sort_i->ind = i;
8814 ncg_new++;
8817 if (debug)
8819 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8820 nsort2, nsort_new);
8822 /* Sort efficiently */
8823 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8824 sort->sort);
8826 else
8828 cgsort = sort->sort;
8829 ncg_new = 0;
8830 for (i = 0; i < dd->ncg_home; i++)
8832 /* Sort on the ns grid cell indices
8833 * and the global topology index
8835 cgsort[i].nsc = a[i];
8836 cgsort[i].ind_gl = dd->index_gl[i];
8837 cgsort[i].ind = i;
8838 if (cgsort[i].nsc < moved)
8840 ncg_new++;
8843 if (debug)
8845 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8847 /* Determine the order of the charge groups using qsort */
8848 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8851 return ncg_new;
8854 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8856 gmx_cgsort_t *sort;
8857 int ncg_new, i, na;
8858 const int *a;
8860 sort = dd->comm->sort->sort;
8862 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8864 ncg_new = 0;
8865 for (i = 0; i < na; i++)
8867 if (a[i] >= 0)
8869 sort[ncg_new].ind = a[i];
8870 ncg_new++;
8874 return ncg_new;
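/* Sort the local state (coordinates, velocities, cg info and indices)
 * according to the new charge group order, so that the data matches the
 * neighbor-search grid order.
 */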
8877 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8878 int ncg_home_old)
8880 gmx_domdec_sort_t *sort;
8881 gmx_cgsort_t *cgsort;
8882 int *cgindex;
8883 int ncg_new, i, *ibuf, cgsize;
8884 rvec *vbuf;
8886 sort = dd->comm->sort;
8888 if (dd->ncg_home > sort->sort_nalloc)
8890 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8891 srenew(sort->sort, sort->sort_nalloc);
8892 srenew(sort->sort2, sort->sort_nalloc);
8894 cgsort = sort->sort;
8896 switch (fr->cutoff_scheme)
8898 case ecutsGROUP:
8899 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8900 break;
8901 case ecutsVERLET:
8902 ncg_new = dd_sort_order_nbnxn(dd, fr);
8903 break;
8904 default:
8905 gmx_incons("unimplemented");
8906 ncg_new = 0;
8909 /* We alloc with the old size, since cgindex is still old */
8910 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8911 vbuf = dd->comm->vbuf.v;
8913 if (dd->comm->bCGs)
8915 cgindex = dd->cgindex;
8917 else
8919 cgindex = NULL;
8922 /* Remove the charge groups which are no longer at home here */
8923 dd->ncg_home = ncg_new;
8924 if (debug)
8926 fprintf(debug, "Set the new home charge group count to %d\n",
8927 dd->ncg_home);
8930 /* Reorder the state */
8931 for (i = 0; i < estNR; i++)
8933 if (EST_DISTR(i) && (state->flags & (1<<i)))
8935 switch (i)
8937 case estX:
8938 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8939 break;
8940 case estV:
8941 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8942 break;
8943 case estSDX:
8944 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8945 break;
8946 case estCGP:
8947 order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8948 break;
8949 case estLD_RNG:
8950 case estLD_RNGI:
8951 case estDISRE_INITF:
8952 case estDISRE_RM3TAV:
8953 case estORIRE_INITF:
8954 case estORIRE_DTAV:
8955 /* No ordering required */
8956 break;
8957 default:
8958 gmx_incons("Unknown state entry encountered in dd_sort_state");
8959 break;
8963 if (fr->cutoff_scheme == ecutsGROUP)
8965 /* Reorder cgcm */
8966 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8969 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8971 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8972 srenew(sort->ibuf, sort->ibuf_nalloc);
8974 ibuf = sort->ibuf;
8975 /* Reorder the global cg index */
8976 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8977 /* Reorder the cginfo */
8978 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8979 /* Rebuild the local cg index */
8980 if (dd->comm->bCGs)
8982 ibuf[0] = 0;
8983 for (i = 0; i < dd->ncg_home; i++)
8985 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8986 ibuf[i+1] = ibuf[i] + cgsize;
8988 for (i = 0; i < dd->ncg_home+1; i++)
8990 dd->cgindex[i] = ibuf[i];
8993 else
8995 for (i = 0; i < dd->ncg_home+1; i++)
8997 dd->cgindex[i] = i;
9000 /* Set the home atom number */
9001 dd->nat_home = dd->cgindex[dd->ncg_home];
9003 if (fr->cutoff_scheme == ecutsVERLET)
9005 /* The atoms are now exactly in grid order, update the grid order */
9006 nbnxn_set_atomorder(fr->nbv->nbs);
9008 else
9010 /* Copy the sorted ns cell indices back to the ns grid struct */
9011 for (i = 0; i < dd->ncg_home; i++)
9013 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9015 fr->ns.grid->nr = dd->ncg_home;
9019 static void add_dd_statistics(gmx_domdec_t *dd)
9021 gmx_domdec_comm_t *comm;
9022 int ddnat;
9024 comm = dd->comm;
9026 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9028 comm->sum_nat[ddnat-ddnatZONE] +=
9029 comm->nat[ddnat] - comm->nat[ddnat-1];
9031 comm->ndecomp++;
9034 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9036 gmx_domdec_comm_t *comm;
9037 int ddnat;
9039 comm = dd->comm;
9041 /* Reset all the statistics and counters for total run counting */
9042 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9044 comm->sum_nat[ddnat-ddnatZONE] = 0;
9046 comm->ndecomp = 0;
9047 comm->nload = 0;
9048 comm->load_step = 0;
9049 comm->load_sum = 0;
9050 comm->load_max = 0;
9051 clear_ivec(comm->load_lim);
9052 comm->load_mdf = 0;
9053 comm->load_pme = 0;
9056 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9058 gmx_domdec_comm_t *comm;
9059 int ddnat;
9060 double av;
9062 comm = cr->dd->comm;
9064 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9066 if (fplog == NULL)
9068 return;
9071 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9073 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9075 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9076 switch (ddnat)
9078 case ddnatZONE:
9079 fprintf(fplog,
9080 " av. #atoms communicated per step for force: %d x %.1f\n",
9081 2, av);
9082 break;
9083 case ddnatVSITE:
9084 if (cr->dd->vsite_comm)
9086 fprintf(fplog,
9087 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9088 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9089 av);
9091 break;
9092 case ddnatCON:
9093 if (cr->dd->constraint_comm)
9095 fprintf(fplog,
9096 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9097 1 + ir->nLincsIter, av);
9099 break;
9100 default:
9101 gmx_incons(" Unknown type for DD statistics");
9104 fprintf(fplog, "\n");
9106 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9108 print_dd_load_av(fplog, cr->dd);
9112 void dd_partition_system(FILE *fplog,
9113 gmx_int64_t step,
9114 t_commrec *cr,
9115 gmx_bool bMasterState,
9116 int nstglobalcomm,
9117 t_state *state_global,
9118 gmx_mtop_t *top_global,
9119 t_inputrec *ir,
9120 t_state *state_local,
9121 rvec **f,
9122 t_mdatoms *mdatoms,
9123 gmx_localtop_t *top_local,
9124 t_forcerec *fr,
9125 gmx_vsite_t *vsite,
9126 gmx_shellfc_t *shellfc,
9127 gmx_constr_t constr,
9128 t_nrnb *nrnb,
9129 gmx_wallcycle_t wcycle,
9130 gmx_bool bVerbose)
9132 gmx_domdec_t *dd;
9133 gmx_domdec_comm_t *comm;
9134 gmx_ddbox_t ddbox = {0};
9135 t_block *cgs_gl;
9136 gmx_int64_t step_pcoupl;
9137 rvec cell_ns_x0, cell_ns_x1;
9138 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9139 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
9140 gmx_bool bRedist, bSortCG, bResortAll;
9141 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9142 real grid_density;
9143 char sbuf[22];
9145 wallcycle_start(wcycle, ewcDOMDEC);
9147 dd = cr->dd;
9148 comm = dd->comm;
9150 bBoxChanged = (bMasterState || DEFORM(*ir));
9151 if (ir->epc != epcNO)
9153 /* With nstpcouple > 1 pressure coupling happens
9154 * one step after calculating the pressure.
9155 * Box scaling happens at the end of the MD step,
9156 * after the DD partitioning.
9157 * We therefore have to do DLB in the first partitioning
9158 * after an MD step where P-coupling occurred.
9159 * We need to determine the last step in which p-coupling occurred.
9160 * MRS -- need to validate this for vv?
9162 n = ir->nstpcouple;
9163 if (n == 1)
9165 step_pcoupl = step - 1;
9167 else
9169 step_pcoupl = ((step - 1)/n)*n + 1;
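/* For example, with n = nstpcouple = 10 and step = 23 this gives
 * step_pcoupl = ((23 - 1)/10)*10 + 1 = 21.
 */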
9171 if (step_pcoupl >= comm->partition_step)
9173 bBoxChanged = TRUE;
9177 bNStGlobalComm = (step % nstglobalcomm == 0);
9179 if (!dlbIsOn(comm))
9181 bDoDLB = FALSE;
9183 else
9185 /* Should we do dynamic load balancing this step?
9186 * Since it requires (possibly expensive) global communication,
9187 * we might want to do DLB less frequently.
9189 if (bBoxChanged || ir->epc != epcNO)
9191 bDoDLB = bBoxChanged;
9193 else
9195 bDoDLB = bNStGlobalComm;
9199 /* Check if we have recorded loads on the nodes */
9200 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9202 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9204 /* Print the load every nstlog steps, and at the first and last step, to the log file */
9205 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9206 comm->n_load_collect == 0 ||
9207 (ir->nsteps >= 0 &&
9208 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9210 /* Avoid extra communication due to verbose screen output
9211 * when nstglobalcomm is set.
9213 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9214 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9216 get_load_distribution(dd, wcycle);
9217 if (DDMASTER(dd))
9219 if (bLogLoad)
9221 dd_print_load(fplog, dd, step-1);
9223 if (bVerbose)
9225 dd_print_load_verbose(dd);
9228 comm->n_load_collect++;
9230 if (bCheckWhetherToTurnDlbOn)
9232 /* Since the timings are node dependent, the master decides */
9233 if (DDMASTER(dd))
9235 /* Here we check if the max PME rank load is more than 0.98
9236 * times the max PP force load. If so, PP DLB will not help,
9237 * since we are (almost) limited by PME. Furthermore,
9238 * DLB will cause a significant extra x/f redistribution
9239 * cost on the PME ranks, which will then surely result
9240 * in lower total performance.
9241 * This check might be fragile, since one measurement
9242 * below 0.98 (although only done once every 100 DD part.)
9243 * could turn on DLB for the rest of the run.
9245 if (cr->npmenodes > 0 &&
9246 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9248 bTurnOnDLB = FALSE;
9250 else
9252 bTurnOnDLB =
9253 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9255 if (debug)
9257 fprintf(debug, "step %s, imb loss %f\n",
9258 gmx_step_str(step, sbuf),
9259 dd_force_imb_perf_loss(dd));
9262 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9263 if (bTurnOnDLB)
9265 turn_on_dlb(fplog, cr, step);
9266 bDoDLB = TRUE;
9270 comm->n_load_have++;
9273 cgs_gl = &comm->cgs_gl;
9275 bRedist = FALSE;
9276 if (bMasterState)
9278 /* Clear the old state */
9279 clear_dd_indices(dd, 0, 0);
9280 ncgindex_set = 0;
9282 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9283 TRUE, cgs_gl, state_global->x, &ddbox);
9285 get_cg_distribution(fplog, dd, cgs_gl,
9286 state_global->box, &ddbox, state_global->x);
9288 dd_distribute_state(dd, cgs_gl,
9289 state_global, state_local, f);
9291 dd_make_local_cgs(dd, &top_local->cgs);
9293 /* Ensure that we have space for the new distribution */
9294 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9296 if (fr->cutoff_scheme == ecutsGROUP)
9298 calc_cgcm(fplog, 0, dd->ncg_home,
9299 &top_local->cgs, state_local->x, fr->cg_cm);
9302 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9304 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9306 else if (state_local->ddp_count != dd->ddp_count)
9308 if (state_local->ddp_count > dd->ddp_count)
9310 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9313 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9315 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9318 /* Clear the old state */
9319 clear_dd_indices(dd, 0, 0);
9321 /* Build the new indices */
9322 rebuild_cgindex(dd, cgs_gl->index, state_local);
9323 make_dd_indices(dd, cgs_gl->index, 0);
9324 ncgindex_set = dd->ncg_home;
9326 if (fr->cutoff_scheme == ecutsGROUP)
9328 /* Redetermine the cg COMs */
9329 calc_cgcm(fplog, 0, dd->ncg_home,
9330 &top_local->cgs, state_local->x, fr->cg_cm);
9333 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9335 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9337 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9338 TRUE, &top_local->cgs, state_local->x, &ddbox);
9340 bRedist = dlbIsOn(comm);
9342 else
9344 /* We have the full state, only redistribute the cgs */
9346 /* Clear the non-home indices */
9347 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9348 ncgindex_set = 0;
9350 /* Avoid global communication for dim's without pbc and -gcom */
9351 if (!bNStGlobalComm)
9353 copy_rvec(comm->box0, ddbox.box0 );
9354 copy_rvec(comm->box_size, ddbox.box_size);
9356 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9357 bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9359 bBoxChanged = TRUE;
9360 bRedist = TRUE;
9362 /* For dim's without pbc and -gcom */
9363 copy_rvec(ddbox.box0, comm->box0 );
9364 copy_rvec(ddbox.box_size, comm->box_size);
9366 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9367 step, wcycle);
9369 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9371 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9374 /* Check if we should sort the charge groups */
9375 if (comm->nstSortCG > 0)
9377 bSortCG = (bMasterState ||
9378 (bRedist && (step % comm->nstSortCG == 0)));
9380 else
9382 bSortCG = FALSE;
9385 ncg_home_old = dd->ncg_home;
9387 ncg_moved = 0;
9388 if (bRedist)
9390 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9392 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9393 state_local, f, fr,
9394 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9396 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9399 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9400 dd, &ddbox,
9401 &comm->cell_x0, &comm->cell_x1,
9402 dd->ncg_home, fr->cg_cm,
9403 cell_ns_x0, cell_ns_x1, &grid_density);
9405 if (bBoxChanged)
9407 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9410 switch (fr->cutoff_scheme)
9412 case ecutsGROUP:
9413 copy_ivec(fr->ns.grid->n, ncells_old);
9414 grid_first(fplog, fr->ns.grid, dd, &ddbox,
9415 state_local->box, cell_ns_x0, cell_ns_x1,
9416 fr->rlistlong, grid_density);
9417 break;
9418 case ecutsVERLET:
9419 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9420 break;
9421 default:
9422 gmx_incons("unimplemented");
9424 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9425 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9427 if (bSortCG)
9429 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9431 /* Sort the state on charge group position.
9432 * This enables exact restarts from this step.
9433 * It also improves performance by about 15% with larger numbers
9434 * of atoms per node.
9437 /* Fill the ns grid with the home cell,
9438 * so we can sort with the indices.
9440 set_zones_ncg_home(dd);
9442 switch (fr->cutoff_scheme)
9444 case ecutsVERLET:
9445 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9447 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9449 comm->zones.size[0].bb_x0,
9450 comm->zones.size[0].bb_x1,
9451 0, dd->ncg_home,
9452 comm->zones.dens_zone0,
9453 fr->cginfo,
9454 state_local->x,
9455 ncg_moved, bRedist ? comm->moved : NULL,
9456 fr->nbv->grp[eintLocal].kernel_type,
9457 fr->nbv->grp[eintLocal].nbat);
9459 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9460 break;
9461 case ecutsGROUP:
9462 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9463 0, dd->ncg_home, fr->cg_cm);
9465 copy_ivec(fr->ns.grid->n, ncells_new);
9466 break;
9467 default:
9468 gmx_incons("unimplemented");
9471 bResortAll = bMasterState;
9473 /* Check if we can use the old order and ns grid cell indices
9474 * of the charge groups to sort the charge groups efficiently.
9476 if (ncells_new[XX] != ncells_old[XX] ||
9477 ncells_new[YY] != ncells_old[YY] ||
9478 ncells_new[ZZ] != ncells_old[ZZ])
9480 bResortAll = TRUE;
9483 if (debug)
9485 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9486 gmx_step_str(step, sbuf), dd->ncg_home);
9488 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9489 bResortAll ? -1 : ncg_home_old);
9490 /* Rebuild all the indices */
9491 ga2la_clear(dd->ga2la);
9492 ncgindex_set = 0;
9494 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9497 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9499 /* Set up the communication and communicate the coordinates */
9500 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9502 /* Set the indices */
9503 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9505 /* Set the charge group boundaries for neighbor searching */
9506 set_cg_boundaries(&comm->zones);
9508 if (fr->cutoff_scheme == ecutsVERLET)
9510 set_zones_size(dd, state_local->box, &ddbox,
9511 bSortCG ? 1 : 0, comm->zones.n);
9514 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9517 /* write_dd_pdb("dd_home", step, "dump", top_global, cr,
9518    -1, state_local->x, state_local->box); */
9521 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9523 /* Extract a local topology from the global topology */
9524 for (i = 0; i < dd->ndim; i++)
9526 np[dd->dim[i]] = comm->cd[i].np;
9528 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9529 comm->cellsize_min, np,
9531 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9532 vsite, top_global, top_local);
9534 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9536 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9538 /* Set up the special atom communication */
9539 n = comm->nat[ddnatZONE];
9540 for (i = ddnatZONE+1; i < ddnatNR; i++)
9542 switch (i)
9544 case ddnatVSITE:
9545 if (vsite && vsite->n_intercg_vsite)
9547 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9549 break;
9550 case ddnatCON:
9551 if (dd->bInterCGcons || dd->bInterCGsettles)
9553 /* Only inter-cg constraints need special code */
9554 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9555 constr, ir->nProjOrder,
9556 top_local->idef.il);
9558 break;
9559 default:
9560 gmx_incons("Unknown special atom type setup");
9562 comm->nat[i] = n;
9565 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9567 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9569 /* Make space for the extra coordinates for virtual site
9570 * or constraint communication.
9572 state_local->natoms = comm->nat[ddnatNR-1];
9573 if (state_local->natoms > state_local->nalloc)
9575 dd_realloc_state(state_local, f, state_local->natoms);
9578 if (fr->bF_NoVirSum)
9580 if (vsite && vsite->n_intercg_vsite)
9582 nat_f_novirsum = comm->nat[ddnatVSITE];
9584 else
9586 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9588 nat_f_novirsum = dd->nat_tot;
9590 else
9592 nat_f_novirsum = dd->nat_home;
9596 else
9598 nat_f_novirsum = 0;

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We set up all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);

    /* Now that we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now that we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
                                  mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Set up the local shell/flexible-constraint data; currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }
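
    /* Build the local generalized Born data for implicit solvent */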
    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }
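
    /* Distribute the listed (bonded) interactions of the local topology over the OpenMP threads */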
    setup_bonded_threading(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME-only node */
        gmx_pme_send_parameters(cr,
                                fr->ic,
                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }
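
    /* Initialize the constraint algorithms for the local topology */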
    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->bPull)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    if (ir->eSwapCoords != eswapNO)
    {
        /* Update the local groups needed for ion swapping */
        dd_make_local_swap_groups(dd, ir->swap);
    }

    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
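
    /* Accumulate the atom-count statistics for this repartitioning */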
    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }

    wallcycle_stop(wcycle, ewcDOMDEC);