src/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "vec.h"
  31 #include "domdec.h"
  32 #include "domdec_network.h"
  33 #include "nrnb.h"
  34 #include "pbc.h"
  35 #include "constr.h"
  36 #include "mdatoms.h"
  37 #include "names.h"
  38 #include "pdbio.h"
  39 #include "futil.h"
  40 #include "force.h"
  41 #include "pme.h"
  42 #include "pull.h"
  43 #include "gmx_wallcycle.h"
  44 #include "mdrun.h"
  45 #include "nsgrid.h"
  46 #include "shellfc.h"
  47 #include "mtop_util.h"
  48 #include "gmxfio.h"
  49 #include "gmx_ga2la.h"
  50
  51 #ifdef GMX_LIB_MPI
  52 #include <mpi.h>
  53 #endif
  54 #ifdef GMX_THREADS
  55 #include "tmpi.h"
  56 #endif
  57
  58 #define DDRANK(dd,rank)    (rank)
  59 #define DDMASTERRANK(dd)   (dd->masterrank)
  60
  61 typedef struct gmx_domdec_master
  62 {
  63     /* The cell boundaries */
  64     real **cell_x;
  65     /* The global charge group division */
  66     int  *ncg;     /* Number of home charge groups for each node */
  67     int  *index;   /* Index of nnodes+1 into cg */
  68     int  *cg;      /* Global charge group index */
  69     int  *nat;     /* Number of home atoms for each node. */
  70     int  *ibuf;    /* Buffer for communication */
  71     rvec *vbuf;    /* Buffer for state scattering and gathering */
  72 } gmx_domdec_master_t;
  73
  74 typedef struct
  75 {
  76     /* The numbers of charge groups to send and receive for each cell
  77      * that requires communication, the last entry contains the total
  78      * number of atoms that needs to be communicated.
  79      */
  80     int nsend[DD_MAXIZONE+2];
  81     int nrecv[DD_MAXIZONE+2];
  82     /* The charge groups to send */
  83     int *index;
  84     int nalloc;
  85     /* The atom range for non-in-place communication */
  86     int cell2at0[DD_MAXIZONE];
  87     int cell2at1[DD_MAXIZONE];
  88 } gmx_domdec_ind_t;
  89
  90 typedef struct
  91 {
  92     int  np;                   /* Number of grid pulses in this dimension */
  93     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
  94     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
  95     int  np_nalloc;
  96     bool bInPlace;             /* Can we communicate in place?            */
  97 } gmx_domdec_comm_dim_t;
  98
  99 typedef struct
 100 {
 101     bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 102     real *cell_f;      /* State var.: cell boundaries, box relative      */
 103     real *old_cell_f;  /* Temp. var.: old cell size                      */
 104     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 105     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 106     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 107     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 108     bool bLimited;     /* State var.: is DLB limited in this dim and row */
 109     real *buf_ncd;     /* Temp. var.                                     */
 110 } gmx_domdec_root_t;
 111
 112 #define DD_NLOAD_MAX 9
 113
 114 /* Here floats are accurate enough, since these variables
 115  * only influence the load balancing, not the actual MD results.
 116  */
 117 typedef struct
 118 {
 119     int  nload;
 120     float *load;
 121     float sum;
 122     float max;
 123     float sum_m;
 124     float cvol_min;
 125     float mdf;
 126     float pme;
 127     int   flags;
 128 } gmx_domdec_load_t;
 129
 130 typedef struct
 131 {
 132     int  nsc;
 133     int  ind_gl;
 134     int  ind;
 135 } gmx_cgsort_t;
 136
 137 typedef struct
 138 {
 139     gmx_cgsort_t *sort1,*sort2;
 140     int  sort_nalloc;
 141     gmx_cgsort_t *sort_new;
 142     int  sort_new_nalloc;
 143     int  *ibuf;
 144     int  ibuf_nalloc;
 145 } gmx_domdec_sort_t;
 146
 147 typedef struct
 148 {
 149     rvec *v;
 150     int  nalloc;
 151 } vec_rvec_t;
 152
 153 /* This enum determines the order of the coordinates.
 154  * ddnatHOME and ddnatZONE should be first and second,
 155  * the others can be ordered as wanted.
 156  */
 157 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 158
 159 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 160 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 161
 162 typedef struct
 163 {
 164     int  dimind;   /* The dimension index                                    */
 165     int  nslab;    /* The number of PME slabs in this dimension              */
 166     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 167     int  *pp_min;  /* The minimum pp node location, size nslab               */
 168     int  *pp_max;  /* The maximum pp node location,size nslab                */
 169     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 170 } gmx_ddpme_t;
 171
 172 typedef struct
 173 {
 174     real min0;    /* The minimum bottom of this zone                        */
 175     real max1;    /* The maximum top of this zone                           */
 176     real mch0;    /* The maximum bottom communicaton height for this zone   */
 177     real mch1;    /* The maximum top communicaton height for this zone      */
 178     real p1_0;    /* The bottom value of the first cell in this zone        */
 179     real p1_1;    /* The top value of the first cell in this zone           */
 180 } gmx_ddzone_t;
 181
 182 typedef struct gmx_domdec_comm
 183 {
 184     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 185      * unless stated otherwise.
 186      */
 187
 188     /* The number of decomposition dimensions for PME, 0: no PME */
 189     int  npmedecompdim;
 190     /* The number of nodes doing PME (PP/PME or only PME) */
 191     int  npmenodes;
 192     int  npmenodes_major;
 193     /* The communication setup including the PME only nodes */
 194     bool bCartesianPP_PME;
 195     ivec ntot;
 196     int  cartpmedim;
 197     int  *pmenodes;          /* size npmenodes                         */
 198     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 199                               * but with bCartesianPP_PME              */
 200     gmx_ddpme_t ddpme[2];
 201
 202     /* The DD particle-particle nodes only */
 203     bool bCartesianPP;
 204     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 205
 206     /* The global charge groups */
 207     t_block cgs_gl;
 208
 209     /* Should we sort the cgs */
 210     int  nstSortCG;
 211     gmx_domdec_sort_t *sort;
 212
 213     /* Are there bonded and multi-body interactions between charge groups? */
 214     bool bInterCGBondeds;
 215     bool bInterCGMultiBody;
 216
 217     /* Data for the optional bonded interaction atom communication range */
 218     bool bBondComm;
 219     t_blocka *cglink;
 220     char *bLocalCG;
 221
 222     /* The DLB option */
 223     int  eDLB;
 224     /* Are we actually using DLB? */
 225     bool bDynLoadBal;
 226
 227     /* Cell sizes for static load balancing, first index cartesian */
 228     real **slb_frac;
 229
 230     /* The width of the communicated boundaries */
 231     real cutoff_mbody;
 232     real cutoff;
 233     /* The minimum cell size (including triclinic correction) */
 234     rvec cellsize_min;
 235     /* For dlb, for use with edlbAUTO */
 236     rvec cellsize_min_dlb;
 237     /* The lower limit for the DD cell size with DLB */
 238     real cellsize_limit;
 239     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 240     bool bVacDLBNoLimit;
 241
 242     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 243     ivec tric_dir;
 244     /* box0 and box_size are required with dim's without pbc and -gcom */
 245     rvec box0;
 246     rvec box_size;
 247
 248     /* The cell boundaries */
 249     rvec cell_x0;
 250     rvec cell_x1;
 251
 252     /* The old location of the cell boundaries, to check cg displacements */
 253     rvec old_cell_x0;
 254     rvec old_cell_x1;
 255
 256     /* The communication setup and charge group boundaries for the zones */
 257     gmx_domdec_zones_t zones;
 258
 259     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 260      * cell boundaries of neighboring cells for dynamic load balancing.
 261      */
 262     gmx_ddzone_t zone_d1[2];
 263     gmx_ddzone_t zone_d2[2][2];
 264
 265     /* The coordinate/force communication setup and indices */
 266     gmx_domdec_comm_dim_t cd[DIM];
 267     /* The maximum number of cells to communicate with in one dimension */
 268     int  maxpulse;
 269
 270     /* Which cg distribution is stored on the master node */
 271     int master_cg_ddp_count;
 272
 273     /* The number of cg's received from the direct neighbors */
 274     int  zone_ncg1[DD_MAXZONE];
 275
 276     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 277     int  nat[ddnatNR];
 278
 279     /* Communication buffer for general use */
 280     int  *buf_int;
 281     int  nalloc_int;
 282
 283      /* Communication buffer for general use */
 284     vec_rvec_t vbuf;
 285
 286     /* Communication buffers only used with multiple grid pulses */
 287     int  *buf_int2;
 288     int  nalloc_int2;
 289     vec_rvec_t vbuf2;
 290
 291     /* Communication buffers for local redistribution */
 292     int  **cggl_flag;
 293     int  cggl_flag_nalloc[DIM*2];
 294     rvec **cgcm_state;
 295     int  cgcm_state_nalloc[DIM*2];
 296
 297     /* Cell sizes for dynamic load balancing */
 298     gmx_domdec_root_t **root;
 299     real *cell_f_row;
 300     real cell_f0[DIM];
 301     real cell_f1[DIM];
 302     real cell_f_max0[DIM];
 303     real cell_f_min1[DIM];
 304
 305     /* Stuff for load communication */
 306     bool bRecordLoad;
 307     gmx_domdec_load_t *load;
 308 #ifdef GMX_MPI
 309     MPI_Comm *mpi_comm_load;
 310 #endif
 311     /* Cycle counters */
 312     float cycl[ddCyclNr];
 313     int   cycl_n[ddCyclNr];
 314     float cycl_max[ddCyclNr];
 315     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 316     int eFlop;
 317     double flop;
 318     int    flop_n;
 319     /* Have often have did we have load measurements */
 320     int    n_load_have;
 321     /* Have often have we collected the load measurements */
 322     int    n_load_collect;
 323
 324     /* Statistics */
 325     double sum_nat[ddnatNR-ddnatZONE];
 326     int    ndecomp;
 327     int    nload;
 328     double load_step;
 329     double load_sum;
 330     double load_max;
 331     ivec   load_lim;
 332     double load_mdf;
 333     double load_pme;
 334
 335     /* The last partition step */
 336     gmx_step_t partition_step;
 337
 338     /* Debugging */
 339     int  nstDDDump;
 340     int  nstDDDumpGrid;
 341     int  DD_debug;
 342 } gmx_domdec_comm_t;
 343
 344 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 345 #define DD_CGIBS 2
 346
 347 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 348 #define DD_FLAG_NRCG  65535
 349 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 350 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 351
 352 /* Zone permutation required to obtain consecutive charge groups
 353  * for neighbor searching.
 354  */
 355 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 356
 357 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 358  * components see only j zones with that component 0.
 359  */
 360
 361 /* The DD zone order */
 362 static const ivec dd_zo[DD_MAXZONE] =
 363   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 364
 365 /* The 3D setup */
 366 #define dd_z3n  8
 367 #define dd_zp3n 4
 368 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 369
 370 /* The 2D setup */
 371 #define dd_z2n  4
 372 #define dd_zp2n 2
 373 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 374
 375 /* The 1D setup */
 376 #define dd_z1n  2
 377 #define dd_zp1n 1
 378 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 379
 380 /* Factors used to avoid problems due to rounding issues */
 381 #define DD_CELL_MARGIN       1.0001
 382 #define DD_CELL_MARGIN2      1.00005
 383 /* Factor to account for pressure scaling during nstlist steps */
 384 #define DD_PRES_SCALE_MARGIN 1.02
 385
 386 /* Allowed performance loss before we DLB or warn */
 387 #define DD_PERF_LOSS 0.05
 388
 389 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 390
 391 /* Use separate MPI send and receive commands
 392  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 393  * This saves memory (and some copying for small nnodes).
 394  * For high parallelization scatter and gather calls are used.
 395  */
 396 #define GMX_DD_NNODES_SENDRECV 4
 397
 398
 399 /*
 400 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 401
 402 static void index2xyz(ivec nc,int ind,ivec xyz)
 403 {
 404   xyz[XX] = ind % nc[XX];
 405   xyz[YY] = (ind / nc[XX]) % nc[YY];
 406   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 407 }
 408 */
 409
 410 /* This order is required to minimize the coordinate communication in PME
 411  * which uses decomposition in the x direction.
 412  */
 413 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 414
 415 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 416 {
 417     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 418     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 419     xyz[ZZ] = ind % nc[ZZ];
 420 }
 421
 422 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 423 {
 424     int ddindex;
 425     int ddnodeid=-1;
 426
 427     ddindex = dd_index(dd->nc,c);
 428     if (dd->comm->bCartesianPP_PME)
 429     {
 430         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 431     }
 432     else if (dd->comm->bCartesianPP)
 433     {
 434 #ifdef GMX_MPI
 435         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 436 #endif
 437     }
 438     else
 439     {
 440         ddnodeid = ddindex;
 441     }
 442
 443     return ddnodeid;
 444 }
 445
 446 static bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 447 {
 448     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 449 }
 450
 451 int ddglatnr(gmx_domdec_t *dd,int i)
 452 {
 453     int atnr;
 454
 455     if (dd == NULL)
 456     {
 457         atnr = i + 1;
 458     }
 459     else
 460     {
 461         if (i >= dd->comm->nat[ddnatNR-1])
 462         {
 463             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 464         }
 465         atnr = dd->gatindex[i] + 1;
 466     }
 467
 468     return atnr;
 469 }
 470
 471 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 472 {
 473     return &dd->comm->cgs_gl;
 474 }
 475
 476 static void check_vec_rvec_alloc(vec_rvec_t *v,int n)
 477 {
 478     if (n > v->nalloc)
 479     {
 480         v->nalloc = over_alloc_dd(n);
 481         srenew(v->v,v->nalloc);
 482     }
 483 }
 484
 485 void dd_store_state(gmx_domdec_t *dd,t_state *state)
 486 {
 487     int i;
 488
 489     if (state->ddp_count != dd->ddp_count)
 490     {
 491         gmx_incons("The state does not the domain decomposition state");
 492     }
 493
 494     state->ncg_gl = dd->ncg_home;
 495     if (state->ncg_gl > state->cg_gl_nalloc)
 496     {
 497         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 498         srenew(state->cg_gl,state->cg_gl_nalloc);
 499     }
 500     for(i=0; i<state->ncg_gl; i++)
 501     {
 502         state->cg_gl[i] = dd->index_gl[i];
 503     }
 504
 505     state->ddp_count_cg_gl = dd->ddp_count;
 506 }
 507
 508 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 509 {
 510     return &dd->comm->zones;
 511 }
 512
 513 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 514                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 515 {
 516     gmx_domdec_zones_t *zones;
 517     int izone,d,dim;
 518
 519     zones = &dd->comm->zones;
 520
 521     izone = 0;
 522     while (icg >= zones->izone[izone].cg1)
 523     {
 524         izone++;
 525     }
 526
 527     if (izone == 0)
 528     {
 529         *jcg0 = icg;
 530     }
 531     else if (izone < zones->nizone)
 532     {
 533         *jcg0 = zones->izone[izone].jcg0;
 534     }
 535     else
 536     {
 537         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 538                   icg,izone,zones->nizone);
 539     }
 540
 541     *jcg1 = zones->izone[izone].jcg1;
 542
 543     for(d=0; d<dd->ndim; d++)
 544     {
 545         dim = dd->dim[d];
 546         shift0[dim] = zones->izone[izone].shift0[dim];
 547         shift1[dim] = zones->izone[izone].shift1[dim];
 548         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 549         {
 550             /* A conservative approach, this can be optimized */
 551             shift0[dim] -= 1;
 552             shift1[dim] += 1;
 553         }
 554     }
 555 }
 556
 557 int dd_natoms_vsite(gmx_domdec_t *dd)
 558 {
 559     return dd->comm->nat[ddnatVSITE];
 560 }
 561
 562 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 563 {
 564     *at_start = dd->comm->nat[ddnatCON-1];
 565     *at_end   = dd->comm->nat[ddnatCON];
 566 }
 567
 568 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 569 {
 570     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 571     int  *index,*cgindex;
 572     gmx_domdec_comm_t *comm;
 573     gmx_domdec_comm_dim_t *cd;
 574     gmx_domdec_ind_t *ind;
 575     rvec shift={0,0,0},*buf,*rbuf;
 576     bool bPBC,bScrew;
 577
 578     comm = dd->comm;
 579
 580     cgindex = dd->cgindex;
 581
 582     buf = comm->vbuf.v;
 583
 584     nzone = 1;
 585     nat_tot = dd->nat_home;
 586     for(d=0; d<dd->ndim; d++)
 587     {
 588         bPBC   = (dd->ci[dd->dim[d]] == 0);
 589         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 590         if (bPBC)
 591         {
 592             copy_rvec(box[dd->dim[d]],shift);
 593         }
 594         cd = &comm->cd[d];
 595         for(p=0; p<cd->np; p++)
 596         {
 597             ind = &cd->ind[p];
 598             index = ind->index;
 599             n = 0;
 600             if (!bPBC)
 601             {
 602                 for(i=0; i<ind->nsend[nzone]; i++)
 603                 {
 604                     at0 = cgindex[index[i]];
 605                     at1 = cgindex[index[i]+1];
 606                     for(j=at0; j<at1; j++)
 607                     {
 608                         copy_rvec(x[j],buf[n]);
 609                         n++;
 610                     }
 611                 }
 612             }
 613             else if (!bScrew)
 614             {
 615                 for(i=0; i<ind->nsend[nzone]; i++)
 616                 {
 617                     at0 = cgindex[index[i]];
 618                     at1 = cgindex[index[i]+1];
 619                     for(j=at0; j<at1; j++)
 620                     {
 621                         /* We need to shift the coordinates */
 622                         rvec_add(x[j],shift,buf[n]);
 623                         n++;
 624                     }
 625                 }
 626             }
 627             else
 628             {
 629                 for(i=0; i<ind->nsend[nzone]; i++)
 630                 {
 631                     at0 = cgindex[index[i]];
 632                     at1 = cgindex[index[i]+1];
 633                     for(j=at0; j<at1; j++)
 634                     {
 635                         /* Shift x */
 636                         buf[n][XX] = x[j][XX] + shift[XX];
 637                         /* Rotate y and z.
 638                          * This operation requires a special shift force
 639                          * treatment, which is performed in calc_vir.
 640                          */
 641                         buf[n][YY] = box[YY][YY] - x[j][YY];
 642                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 643                         n++;
 644                     }
 645                 }
 646             }
 647
 648             if (cd->bInPlace)
 649             {
 650                 rbuf = x + nat_tot;
 651             }
 652             else
 653             {
 654                 rbuf = comm->vbuf2.v;
 655             }
 656             /* Send and receive the coordinates */
 657             dd_sendrecv_rvec(dd, d, dddirBackward,
 658                              buf,  ind->nsend[nzone+1],
 659                              rbuf, ind->nrecv[nzone+1]);
 660             if (!cd->bInPlace)
 661             {
 662                 j = 0;
 663                 for(zone=0; zone<nzone; zone++)
 664                 {
 665                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 666                     {
 667                         copy_rvec(rbuf[j],x[i]);
 668                         j++;
 669                     }
 670                 }
 671             }
 672             nat_tot += ind->nrecv[nzone+1];
 673         }
 674         nzone += nzone;
 675     }
 676 }
 677
 678 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 679 {
 680     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 681     int  *index,*cgindex;
 682     gmx_domdec_comm_t *comm;
 683     gmx_domdec_comm_dim_t *cd;
 684     gmx_domdec_ind_t *ind;
 685     rvec *buf,*sbuf;
 686     ivec vis;
 687     int  is;
 688     bool bPBC,bScrew;
 689
 690     comm = dd->comm;
 691
 692     cgindex = dd->cgindex;
 693
 694     buf = comm->vbuf.v;
 695
 696     n = 0;
 697     nzone = comm->zones.n/2;
 698     nat_tot = dd->nat_tot;
 699     for(d=dd->ndim-1; d>=0; d--)
 700     {
 701         bPBC   = (dd->ci[dd->dim[d]] == 0);
 702         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 703         if (fshift == NULL && !bScrew)
 704         {
 705             bPBC = FALSE;
 706         }
 707         /* Determine which shift vector we need */
 708         clear_ivec(vis);
 709         vis[dd->dim[d]] = 1;
 710         is = IVEC2IS(vis);
 711
 712         cd = &comm->cd[d];
 713         for(p=cd->np-1; p>=0; p--) {
 714             ind = &cd->ind[p];
 715             nat_tot -= ind->nrecv[nzone+1];
 716             if (cd->bInPlace)
 717             {
 718                 sbuf = f + nat_tot;
 719             }
 720             else
 721             {
 722                 sbuf = comm->vbuf2.v;
 723                 j = 0;
 724                 for(zone=0; zone<nzone; zone++)
 725                 {
 726                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 727                     {
 728                         copy_rvec(f[i],sbuf[j]);
 729                         j++;
 730                     }
 731                 }
 732             }
 733             /* Communicate the forces */
 734             dd_sendrecv_rvec(dd, d, dddirForward,
 735                              sbuf, ind->nrecv[nzone+1],
 736                              buf,  ind->nsend[nzone+1]);
 737             index = ind->index;
 738             /* Add the received forces */
 739             n = 0;
 740             if (!bPBC)
 741             {
 742                 for(i=0; i<ind->nsend[nzone]; i++)
 743                 {
 744                     at0 = cgindex[index[i]];
 745                     at1 = cgindex[index[i]+1];
 746                     for(j=at0; j<at1; j++)
 747                     {
 748                         rvec_inc(f[j],buf[n]);
 749                         n++;
 750                     }
 751                 }
 752             }
 753             else if (!bScrew)
 754             {
 755                 for(i=0; i<ind->nsend[nzone]; i++)
 756                 {
 757                     at0 = cgindex[index[i]];
 758                     at1 = cgindex[index[i]+1];
 759                     for(j=at0; j<at1; j++)
 760                     {
 761                         rvec_inc(f[j],buf[n]);
 762                         /* Add this force to the shift force */
 763                         rvec_inc(fshift[is],buf[n]);
 764                         n++;
 765                     }
 766                 }
 767             }
 768             else
 769             {
 770                 for(i=0; i<ind->nsend[nzone]; i++)
 771                 {
 772                     at0 = cgindex[index[i]];
 773                     at1 = cgindex[index[i]+1];
 774                     for(j=at0; j<at1; j++)
 775                     {
 776                         /* Rotate the force */
 777                         f[j][XX] += buf[n][XX];
 778                         f[j][YY] -= buf[n][YY];
 779                         f[j][ZZ] -= buf[n][ZZ];
 780                         if (fshift)
 781                         {
 782                             /* Add this force to the shift force */
 783                             rvec_inc(fshift[is],buf[n]);
 784                         }
 785                         n++;
 786                     }
 787                 }
 788             }
 789         }
 790         nzone /= 2;
 791     }
 792 }
 793
 794 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 795 {
 796     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 797     int  *index,*cgindex;
 798     gmx_domdec_comm_t *comm;
 799     gmx_domdec_comm_dim_t *cd;
 800     gmx_domdec_ind_t *ind;
 801     real *buf,*rbuf;
 802
 803     comm = dd->comm;
 804
 805     cgindex = dd->cgindex;
 806
 807     buf = &comm->vbuf.v[0][0];
 808
 809     nzone = 1;
 810     nat_tot = dd->nat_home;
 811     for(d=0; d<dd->ndim; d++)
 812     {
 813         cd = &comm->cd[d];
 814         for(p=0; p<cd->np; p++)
 815         {
 816             ind = &cd->ind[p];
 817             index = ind->index;
 818             n = 0;
 819             for(i=0; i<ind->nsend[nzone]; i++)
 820             {
 821                 at0 = cgindex[index[i]];
 822                 at1 = cgindex[index[i]+1];
 823                 for(j=at0; j<at1; j++)
 824                 {
 825                     buf[n] = v[j];
 826                     n++;
 827                 }
 828             }
 829
 830             if (cd->bInPlace)
 831             {
 832                 rbuf = v + nat_tot;
 833             }
 834             else
 835             {
 836                 rbuf = &comm->vbuf2.v[0][0];
 837             }
 838             /* Send and receive the coordinates */
 839             dd_sendrecv_real(dd, d, dddirBackward,
 840                              buf,  ind->nsend[nzone+1],
 841                              rbuf, ind->nrecv[nzone+1]);
 842             if (!cd->bInPlace)
 843             {
 844                 j = 0;
 845                 for(zone=0; zone<nzone; zone++)
 846                 {
 847                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 848                     {
 849                         v[i] = rbuf[j];
 850                         j++;
 851                     }
 852                 }
 853             }
 854             nat_tot += ind->nrecv[nzone+1];
 855         }
 856         nzone += nzone;
 857     }
 858 }
 859
 860 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 861 {
 862     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 863     int  *index,*cgindex;
 864     gmx_domdec_comm_t *comm;
 865     gmx_domdec_comm_dim_t *cd;
 866     gmx_domdec_ind_t *ind;
 867     real *buf,*sbuf;
 868
 869     comm = dd->comm;
 870
 871     cgindex = dd->cgindex;
 872
 873     buf = &comm->vbuf.v[0][0];
 874
 875     n = 0;
 876     nzone = comm->zones.n/2;
 877     nat_tot = dd->nat_tot;
 878     for(d=dd->ndim-1; d>=0; d--)
 879     {
 880         cd = &comm->cd[d];
 881         for(p=cd->np-1; p>=0; p--) {
 882             ind = &cd->ind[p];
 883             nat_tot -= ind->nrecv[nzone+1];
 884             if (cd->bInPlace)
 885             {
 886                 sbuf = v + nat_tot;
 887             }
 888             else
 889             {
 890                 sbuf = &comm->vbuf2.v[0][0];
 891                 j = 0;
 892                 for(zone=0; zone<nzone; zone++)
 893                 {
 894                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 895                     {
 896                         sbuf[j] = v[i];
 897                         j++;
 898                     }
 899                 }
 900             }
 901             /* Communicate the forces */
 902             dd_sendrecv_real(dd, d, dddirForward,
 903                              sbuf, ind->nrecv[nzone+1],
 904                              buf,  ind->nsend[nzone+1]);
 905             index = ind->index;
 906             /* Add the received forces */
 907             n = 0;
 908             for(i=0; i<ind->nsend[nzone]; i++)
 909             {
 910                 at0 = cgindex[index[i]];
 911                 at1 = cgindex[index[i]+1];
 912                 for(j=at0; j<at1; j++)
 913                 {
 914                     v[j] += buf[n];
 915                     n++;
 916                 }
 917             }
 918         }
 919         nzone /= 2;
 920     }
 921 }
 922
 923 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 924 {
 925     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 926             d,i,j,
 927             zone->min0,zone->max1,
 928             zone->mch0,zone->mch0,
 929             zone->p1_0,zone->p1_1);
 930 }
 931
 932 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 933                                int ddimind,int direction,
 934                                gmx_ddzone_t *buf_s,int n_s,
 935                                gmx_ddzone_t *buf_r,int n_r)
 936 {
 937     rvec vbuf_s[5*2],vbuf_r[5*2];
 938     int i;
 939
 940     for(i=0; i<n_s; i++)
 941     {
 942         vbuf_s[i*2  ][0] = buf_s[i].min0;
 943         vbuf_s[i*2  ][1] = buf_s[i].max1;
 944         vbuf_s[i*2  ][2] = buf_s[i].mch0;
 945         vbuf_s[i*2+1][0] = buf_s[i].mch1;
 946         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
 947         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
 948     }
 949
 950     dd_sendrecv_rvec(dd, ddimind, direction,
 951                      vbuf_s, n_s*2,
 952                      vbuf_r, n_r*2);
 953
 954     for(i=0; i<n_r; i++)
 955     {
 956         buf_r[i].min0 = vbuf_r[i*2  ][0];
 957         buf_r[i].max1 = vbuf_r[i*2  ][1];
 958         buf_r[i].mch0 = vbuf_r[i*2  ][2];
 959         buf_r[i].mch1 = vbuf_r[i*2+1][0];
 960         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
 961         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
 962     }
 963 }
 964
 965 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 966                           rvec cell_ns_x0,rvec cell_ns_x1)
 967 {
 968     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 969     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
 970     rvec extr_s[2],extr_r[2];
 971     rvec dh;
 972     real dist_d,c=0,det;
 973     gmx_domdec_comm_t *comm;
 974     bool bPBC,bUse;
 975
 976     comm = dd->comm;
 977
 978     for(d=1; d<dd->ndim; d++)
 979     {
 980         dim = dd->dim[d];
 981         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 982         zp->min0 = cell_ns_x0[dim];
 983         zp->max1 = cell_ns_x1[dim];
 984         zp->mch0 = cell_ns_x0[dim];
 985         zp->mch1 = cell_ns_x1[dim];
 986         zp->p1_0 = cell_ns_x0[dim];
 987         zp->p1_1 = cell_ns_x1[dim];
 988     }
 989
 990     for(d=dd->ndim-2; d>=0; d--)
 991     {
 992         dim  = dd->dim[d];
 993         bPBC = (dim < ddbox->npbcdim);
 994
 995         /* Use an rvec to store two reals */
 996         extr_s[d][0] = comm->cell_f0[d+1];
 997         extr_s[d][1] = comm->cell_f1[d+1];
 998         extr_s[d][2] = 0;
 999
1000         pos = 0;
1001         /* Store the extremes in the backward sending buffer,
1002          * so the get updated separately from the forward communication.
1003          */
1004         for(d1=d; d1<dd->ndim-1; d1++)
1005         {
1006             /* We invert the order to be able to use the same loop for buf_e */
1007             buf_s[pos].min0 = extr_s[d1][1];
1008             buf_s[pos].max1 = extr_s[d1][0];
1009             buf_s[pos].mch0 = 0;
1010             buf_s[pos].mch1 = 0;
1011             /* Store the cell corner of the dimension we communicate along */
1012             buf_s[pos].p1_0 = comm->cell_x0[dim];
1013             buf_s[pos].p1_1 = 0;
1014             pos++;
1015         }
1016
1017         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1018         pos++;
1019
1020         if (dd->ndim == 3 && d == 0)
1021         {
1022             buf_s[pos] = comm->zone_d2[0][1];
1023             pos++;
1024             buf_s[pos] = comm->zone_d1[0];
1025             pos++;
1026         }
1027
1028         /* We only need to communicate the extremes
1029          * in the forward direction
1030          */
1031         npulse = comm->cd[d].np;
1032         if (bPBC)
1033         {
1034             /* Take the minimum to avoid double communication */
1035             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1036         }
1037         else
1038         {
1039             /* Without PBC we should really not communicate over
1040              * the boundaries, but implementing that complicates
1041              * the communication setup and therefore we simply
1042              * do all communication, but ignore some data.
1043              */
1044             npulse_min = npulse;
1045         }
1046         for(p=0; p<npulse_min; p++)
1047         {
1048             /* Communicate the extremes forward */
1049             bUse = (bPBC || dd->ci[dim] > 0);
1050
1051             dd_sendrecv_rvec(dd, d, dddirForward,
1052                              extr_s+d, dd->ndim-d-1,
1053                              extr_r+d, dd->ndim-d-1);
1054
1055             if (bUse)
1056             {
1057                 for(d1=d; d1<dd->ndim-1; d1++)
1058                 {
1059                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1060                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1061                 }
1062             }
1063         }
1064
1065         buf_size = pos;
1066         for(p=0; p<npulse; p++)
1067         {
1068             /* Communicate all the zone information backward */
1069             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1070
1071             dd_sendrecv_ddzone(dd, d, dddirBackward,
1072                                buf_s, buf_size,
1073                                buf_r, buf_size);
1074
1075             clear_rvec(dh);
1076             if (p > 0)
1077             {
1078                 for(d1=d+1; d1<dd->ndim; d1++)
1079                 {
1080                     /* Determine the decrease of maximum required
1081                      * communication height along d1 due to the distance along d,
1082                      * this avoids a lot of useless atom communication.
1083                      */
1084                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1085
1086                     if (ddbox->tric_dir[dim])
1087                     {
1088                         /* c is the off-diagonal coupling between the cell planes
1089                          * along directions d and d1.
1090                          */
1091                         c = ddbox->v[dim][dd->dim[d1]][dim];
1092                     }
1093                     else
1094                     {
1095                         c = 0;
1096                     }
1097                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1098                     if (det > 0)
1099                     {
1100                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1101                     }
1102                     else
1103                     {
1104                         /* A negative value signals out of range */
1105                         dh[d1] = -1;
1106                     }
1107                 }
1108             }
1109
1110             /* Accumulate the extremes over all pulses */
1111             for(i=0; i<buf_size; i++)
1112             {
1113                 if (p == 0)
1114                 {
1115                     buf_e[i] = buf_r[i];
1116                 }
1117                 else
1118                 {
1119                     if (bUse)
1120                     {
1121                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1122                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1123                     }
1124
1125                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1126                     {
1127                         d1 = 1;
1128                     }
1129                     else
1130                     {
1131                         d1 = d + 1;
1132                     }
1133                     if (bUse && dh[d1] >= 0)
1134                     {
1135                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1136                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1137                     }
1138                 }
1139                 /* Copy the received buffer to the send buffer,
1140                  * to pass the data through with the next pulse.
1141                  */
1142                 buf_s[i] = buf_r[i];
1143             }
1144             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1145                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1146             {
1147                 /* Store the extremes */
1148                 pos = 0;
1149
1150                 for(d1=d; d1<dd->ndim-1; d1++)
1151                 {
1152                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1153                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1154                     pos++;
1155                 }
1156
1157                 if (d == 1 || (d == 0 && dd->ndim == 3))
1158                 {
1159                     for(i=d; i<2; i++)
1160                     {
1161                         comm->zone_d2[1-d][i] = buf_e[pos];
1162                         pos++;
1163                     }
1164                 }
1165                 if (d == 0)
1166                 {
1167                     comm->zone_d1[1] = buf_e[pos];
1168                     pos++;
1169                 }
1170             }
1171         }
1172     }
1173
1174     if (dd->ndim >= 2)
1175     {
1176         dim = dd->dim[1];
1177         for(i=0; i<2; i++)
1178         {
1179             if (debug)
1180             {
1181                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1182             }
1183             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1184             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1185         }
1186     }
1187     if (dd->ndim >= 3)
1188     {
1189         dim = dd->dim[2];
1190         for(i=0; i<2; i++)
1191         {
1192             for(j=0; j<2; j++)
1193             {
1194                 if (debug)
1195                 {
1196                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1197                 }
1198                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1199                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1200             }
1201         }
1202     }
1203     for(d=1; d<dd->ndim; d++)
1204     {
1205         comm->cell_f_max0[d] = extr_s[d-1][0];
1206         comm->cell_f_min1[d] = extr_s[d-1][1];
1207         if (debug)
1208         {
1209             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1210                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1211         }
1212     }
1213 }
1214
1215 static void dd_collect_cg(gmx_domdec_t *dd,
1216                           t_state *state_local)
1217 {
1218     gmx_domdec_master_t *ma=NULL;
1219     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1220     t_block *cgs_gl;
1221
1222     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1223     {
1224         /* The master has the correct distribution */
1225         return;
1226     }
1227
1228     if (state_local->ddp_count == dd->ddp_count)
1229     {
1230         ncg_home = dd->ncg_home;
1231         cg       = dd->index_gl;
1232         nat_home = dd->nat_home;
1233     }
1234     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1235     {
1236         cgs_gl = &dd->comm->cgs_gl;
1237
1238         ncg_home = state_local->ncg_gl;
1239         cg       = state_local->cg_gl;
1240         nat_home = 0;
1241         for(i=0; i<ncg_home; i++)
1242         {
1243             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1244         }
1245     }
1246     else
1247     {
1248         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1249     }
1250
1251     buf2[0] = dd->ncg_home;
1252     buf2[1] = dd->nat_home;
1253     if (DDMASTER(dd))
1254     {
1255         ma = dd->ma;
1256         ibuf = ma->ibuf;
1257     }
1258     else
1259     {
1260         ibuf = NULL;
1261     }
1262     /* Collect the charge group and atom counts on the master */
1263     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1264
1265     if (DDMASTER(dd))
1266     {
1267         ma->index[0] = 0;
1268         for(i=0; i<dd->nnodes; i++)
1269         {
1270             ma->ncg[i] = ma->ibuf[2*i];
1271             ma->nat[i] = ma->ibuf[2*i+1];
1272             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1273
1274         }
1275         /* Make byte counts and indices */
1276         for(i=0; i<dd->nnodes; i++)
1277         {
1278             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1279             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1280         }
1281         if (debug)
1282         {
1283             fprintf(debug,"Initial charge group distribution: ");
1284             for(i=0; i<dd->nnodes; i++)
1285                 fprintf(debug," %d",ma->ncg[i]);
1286             fprintf(debug,"\n");
1287         }
1288     }
1289
1290     /* Collect the charge group indices on the master */
1291     dd_gatherv(dd,
1292                dd->ncg_home*sizeof(int),dd->index_gl,
1293                DDMASTER(dd) ? ma->ibuf : NULL,
1294                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1295                DDMASTER(dd) ? ma->cg : NULL);
1296
1297     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1298 }
1299
1300 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1301                                     rvec *lv,rvec *v)
1302 {
1303     gmx_domdec_master_t *ma;
1304     int  n,i,c,a,nalloc=0;
1305     rvec *buf=NULL;
1306     t_block *cgs_gl;
1307
1308     ma = dd->ma;
1309
1310     if (!DDMASTER(dd))
1311     {
1312 #ifdef GMX_MPI
1313         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1314                  dd->rank,dd->mpi_comm_all);
1315 #endif
1316     } else {
1317         /* Copy the master coordinates to the global array */
1318         cgs_gl = &dd->comm->cgs_gl;
1319
1320         n = DDMASTERRANK(dd);
1321         a = 0;
1322         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1323         {
1324             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1325             {
1326                 copy_rvec(lv[a++],v[c]);
1327             }
1328         }
1329
1330         for(n=0; n<dd->nnodes; n++)
1331         {
1332             if (n != dd->rank)
1333             {
1334                 if (ma->nat[n] > nalloc)
1335                 {
1336                     nalloc = over_alloc_dd(ma->nat[n]);
1337                     srenew(buf,nalloc);
1338                 }
1339 #ifdef GMX_MPI
1340                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1341                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1342 #endif
1343                 a = 0;
1344                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1345                 {
1346                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1347                     {
1348                         copy_rvec(buf[a++],v[c]);
1349                     }
1350                 }
1351             }
1352         }
1353         sfree(buf);
1354     }
1355 }
1356
1357 static void get_commbuffer_counts(gmx_domdec_t *dd,
1358                                   int **counts,int **disps)
1359 {
1360     gmx_domdec_master_t *ma;
1361     int n;
1362
1363     ma = dd->ma;
1364
1365     /* Make the rvec count and displacment arrays */
1366     *counts  = ma->ibuf;
1367     *disps   = ma->ibuf + dd->nnodes;
1368     for(n=0; n<dd->nnodes; n++)
1369     {
1370         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1371         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1372     }
1373 }
1374
1375 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1376                                    rvec *lv,rvec *v)
1377 {
1378     gmx_domdec_master_t *ma;
1379     int  *rcounts=NULL,*disps=NULL;
1380     int  n,i,c,a;
1381     rvec *buf=NULL;
1382     t_block *cgs_gl;
1383
1384     ma = dd->ma;
1385
1386     if (DDMASTER(dd))
1387     {
1388         get_commbuffer_counts(dd,&rcounts,&disps);
1389
1390         buf = ma->vbuf;
1391     }
1392
1393     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1394
1395     if (DDMASTER(dd))
1396     {
1397         cgs_gl = &dd->comm->cgs_gl;
1398
1399         a = 0;
1400         for(n=0; n<dd->nnodes; n++)
1401         {
1402             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1403             {
1404                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1405                 {
1406                     copy_rvec(buf[a++],v[c]);
1407                 }
1408             }
1409         }
1410     }
1411 }
1412
1413 void dd_collect_vec(gmx_domdec_t *dd,
1414                     t_state *state_local,rvec *lv,rvec *v)
1415 {
1416     gmx_domdec_master_t *ma;
1417     int  n,i,c,a,nalloc=0;
1418     rvec *buf=NULL;
1419
1420     dd_collect_cg(dd,state_local);
1421
1422     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1423     {
1424         dd_collect_vec_sendrecv(dd,lv,v);
1425     }
1426     else
1427     {
1428         dd_collect_vec_gatherv(dd,lv,v);
1429     }
1430 }
1431
1432
1433 void dd_collect_state(gmx_domdec_t *dd,
1434                       t_state *state_local,t_state *state)
1435 {
1436     int est,i;
1437
1438     if (DDMASTER(dd))
1439     {
1440         state->lambda = state_local->lambda;
1441         copy_mat(state_local->box,state->box);
1442         copy_mat(state_local->boxv,state->boxv);
1443         copy_mat(state_local->pres_prev,state->pres_prev);
1444         for(i=0; i<state_local->ngtc; i++)
1445         {
1446             state->nosehoover_xi[i]  = state_local->nosehoover_xi[i];
1447             state->therm_integral[i] = state_local->therm_integral[i];
1448         }
1449     }
1450     for(est=estX; est<estNR; est++)
1451     {
1452         if (state_local->flags & (1<<est))
1453         {
1454             switch (est) {
1455             case estX:
1456                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1457                 break;
1458             case estV:
1459                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1460                 break;
1461             case estSDX:
1462                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1463                 break;
1464             case estCGP:
1465                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1466                 break;
1467             case estLD_RNG:
1468                 if (state->nrngi == 1)
1469                 {
1470                     if (DDMASTER(dd))
1471                     {
1472                         for(i=0; i<state_local->nrng; i++)
1473                         {
1474                             state->ld_rng[i] = state_local->ld_rng[i];
1475                         }
1476                     }
1477                 }
1478                 else
1479                 {
1480                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1481                               state_local->ld_rng,state->ld_rng);
1482                 }
1483                 break;
1484             case estLD_RNGI:
1485                 if (state->nrngi == 1)
1486                 {
1487                    if (DDMASTER(dd))
1488                     {
1489                         state->ld_rngi[0] = state_local->ld_rngi[0];
1490                     }
1491                 }
1492                 else
1493                 {
1494                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1495                               state_local->ld_rngi,state->ld_rngi);
1496                 }
1497                 break;
1498             case estDISRE_INITF:
1499             case estDISRE_RM3TAV:
1500             case estORIRE_INITF:
1501             case estORIRE_DTAV:
1502                 break;
1503             default:
1504                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1505             }
1506         }
1507     }
1508 }
1509
1510 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1511 {
1512     if (debug)
1513     {
1514         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1515     }
1516     fr->cg_nalloc = over_alloc_dd(nalloc);
1517     srenew(fr->cg_cm,fr->cg_nalloc);
1518     srenew(fr->cginfo,fr->cg_nalloc);
1519 }
1520
1521 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1522 {
1523     int est;
1524
1525     if (debug)
1526     {
1527         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1528     }
1529
1530     state->nalloc = over_alloc_dd(nalloc);
1531
1532     for(est=estX; est<estNR; est++)
1533     {
1534         if (state->flags & (1<<est))
1535         {
1536             switch(est) {
1537             case estX:
1538                 srenew(state->x,state->nalloc);
1539                 break;
1540             case estV:
1541                 srenew(state->v,state->nalloc);
1542                 break;
1543             case estSDX:
1544                 srenew(state->sd_X,state->nalloc);
1545                 break;
1546             case estCGP:
1547                 srenew(state->cg_p,state->nalloc);
1548                 break;
1549             case estLD_RNG:
1550             case estLD_RNGI:
1551             case estDISRE_INITF:
1552             case estDISRE_RM3TAV:
1553             case estORIRE_INITF:
1554             case estORIRE_DTAV:
1555                 /* No reallocation required */
1556                 break;
1557             default:
1558                 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1559             }
1560         }
1561     }
1562
1563     if (f != NULL)
1564     {
1565         srenew(*f,state->nalloc);
1566     }
1567 }
1568
1569 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1570                                        rvec *v,rvec *lv)
1571 {
1572     gmx_domdec_master_t *ma;
1573     int  n,i,c,a,nalloc=0;
1574     rvec *buf=NULL;
1575
1576     if (DDMASTER(dd))
1577     {
1578         ma  = dd->ma;
1579
1580         for(n=0; n<dd->nnodes; n++)
1581         {
1582             if (n != dd->rank)
1583             {
1584                 if (ma->nat[n] > nalloc)
1585                 {
1586                     nalloc = over_alloc_dd(ma->nat[n]);
1587                     srenew(buf,nalloc);
1588                 }
1589                 /* Use lv as a temporary buffer */
1590                 a = 0;
1591                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1592                 {
1593                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1594                     {
1595                         copy_rvec(v[c],buf[a++]);
1596                     }
1597                 }
1598                 if (a != ma->nat[n])
1599                 {
1600                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1601                               a,ma->nat[n]);
1602                 }
1603
1604 #ifdef GMX_MPI
1605                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1606                          DDRANK(dd,n),n,dd->mpi_comm_all);
1607 #endif
1608             }
1609         }
1610         sfree(buf);
1611         n = DDMASTERRANK(dd);
1612         a = 0;
1613         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1614         {
1615             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1616             {
1617                 copy_rvec(v[c],lv[a++]);
1618             }
1619         }
1620     }
1621     else
1622     {
1623 #ifdef GMX_MPI
1624         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1625                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1626 #endif
1627     }
1628 }
1629
1630 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1631                                        rvec *v,rvec *lv)
1632 {
1633     gmx_domdec_master_t *ma;
1634     int  *scounts=NULL,*disps=NULL;
1635     int  n,i,c,a,nalloc=0;
1636     rvec *buf=NULL;
1637
1638     if (DDMASTER(dd))
1639     {
1640         ma  = dd->ma;
1641
1642         get_commbuffer_counts(dd,&scounts,&disps);
1643
1644         buf = ma->vbuf;
1645         a = 0;
1646         for(n=0; n<dd->nnodes; n++)
1647         {
1648             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1649             {
1650                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1651                 {
1652                     copy_rvec(v[c],buf[a++]);
1653                 }
1654             }
1655         }
1656     }
1657
1658     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1659 }
1660
1661 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1662 {
1663     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1664     {
1665         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1666     }
1667     else
1668     {
1669         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1670     }
1671 }
1672
1673 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1674                                 t_state *state,t_state *state_local,
1675                                 rvec **f)
1676 {
1677     int  i;
1678
1679     if (DDMASTER(dd))
1680     {
1681         state_local->lambda = state->lambda;
1682         copy_mat(state->box,state_local->box);
1683         copy_mat(state->box_rel,state_local->box_rel);
1684         copy_mat(state->boxv,state_local->boxv);
1685         for(i=0; i<state_local->ngtc; i++)
1686         {
1687             state_local->nosehoover_xi[i]  = state->nosehoover_xi[i];
1688             state_local->therm_integral[i] = state->therm_integral[i];
1689         }
1690     }
1691     dd_bcast(dd,sizeof(real),&state_local->lambda);
1692     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1693     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1694     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1695     dd_bcast(dd,state_local->ngtc*sizeof(real),state_local->nosehoover_xi);
1696     dd_bcast(dd,state_local->ngtc*sizeof(real),state_local->therm_integral);
1697
1698     if (dd->nat_home > state_local->nalloc)
1699     {
1700         dd_realloc_state(state_local,f,dd->nat_home);
1701     }
1702     for(i=estX; i<estNR; i++)
1703     {
1704         if (state_local->flags & (1<<i))
1705         {
1706             switch (i) {
1707             case estX:
1708                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1709                 break;
1710             case estV:
1711                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1712                 break;
1713             case estSDX:
1714                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1715                 break;
1716             case estCGP:
1717                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1718                 break;
1719             case estLD_RNG:
1720                 if (state->nrngi == 1)
1721                 {
1722                     dd_bcastc(dd,
1723                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1724                               state->ld_rng,state_local->ld_rng);
1725                 }
1726                 else
1727                 {
1728                     dd_scatter(dd,
1729                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1730                                state->ld_rng,state_local->ld_rng);
1731                 }
1732                 break;
1733             case estLD_RNGI:
1734                 if (state->nrngi == 1)
1735                 {
1736                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1737                               state->ld_rngi,state_local->ld_rngi);
1738                 }
1739                 else
1740                 {
1741                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1742                                state->ld_rngi,state_local->ld_rngi);
1743                 }
1744                 break;
1745             case estDISRE_INITF:
1746             case estDISRE_RM3TAV:
1747             case estORIRE_INITF:
1748             case estORIRE_DTAV:
1749                 /* Not implemented yet */
1750                 break;
1751             default:
1752                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1753             }
1754         }
1755     }
1756 }
1757
1758 static char dim2char(int dim)
1759 {
1760     char c='?';
1761
1762     switch (dim)
1763     {
1764     case XX: c = 'X'; break;
1765     case YY: c = 'Y'; break;
1766     case ZZ: c = 'Z'; break;
1767     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1768     }
1769
1770     return c;
1771 }
1772
1773 static void write_dd_grid_pdb(const char *fn,gmx_step_t step,
1774                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1775 {
1776     rvec grid_s[2],*grid_r=NULL,cx,r;
1777     char fname[STRLEN],format[STRLEN],buf[22];
1778     FILE *out;
1779     int  a,i,d,z,y,x;
1780     matrix tric;
1781     real vol;
1782
1783     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1784     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1785
1786     if (DDMASTER(dd))
1787     {
1788         snew(grid_r,2*dd->nnodes);
1789     }
1790
1791     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1792
1793     if (DDMASTER(dd))
1794     {
1795         for(d=0; d<DIM; d++)
1796         {
1797             for(i=0; i<DIM; i++)
1798             {
1799                 if (d == i)
1800                 {
1801                     tric[d][i] = 1;
1802                 }
1803                 else
1804                 {
1805                     if (dd->nc[d] > 1 && d < ddbox->npbcdim)
1806                     {
1807                         tric[d][i] = box[i][d]/box[i][i];
1808                     }
1809                     else
1810                     {
1811                         tric[d][i] = 0;
1812                     }
1813                 }
1814             }
1815         }
1816         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1817         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1818         out = gmx_fio_fopen(fname,"w");
1819         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1820         a = 1;
1821         for(i=0; i<dd->nnodes; i++)
1822         {
1823             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1824             for(d=0; d<DIM; d++)
1825             {
1826                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1827             }
1828             for(z=0; z<2; z++)
1829             {
1830                 for(y=0; y<2; y++)
1831                 {
1832                     for(x=0; x<2; x++)
1833                     {
1834                         cx[XX] = grid_r[i*2+x][XX];
1835                         cx[YY] = grid_r[i*2+y][YY];
1836                         cx[ZZ] = grid_r[i*2+z][ZZ];
1837                         mvmul(tric,cx,r);
1838                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1839                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1840                     }
1841                 }
1842             }
1843             for(d=0; d<DIM; d++)
1844             {
1845                 for(x=0; x<4; x++)
1846                 {
1847                     switch(d)
1848                     {
1849                     case 0: y = 1 + i*8 + 2*x; break;
1850                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1851                     case 2: y = 1 + i*8 + x; break;
1852                     }
1853                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1854                 }
1855             }
1856         }
1857         gmx_fio_fclose(out);
1858         sfree(grid_r);
1859     }
1860 }
1861
1862 void write_dd_pdb(const char *fn,gmx_step_t step,const char *title,
1863                   gmx_mtop_t *mtop,t_commrec *cr,
1864                   int natoms,rvec x[],matrix box)
1865 {
1866     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1867     FILE *out;
1868     int  i,ii,resnr,c;
1869     char *atomname,*resname;
1870     real b;
1871     gmx_domdec_t *dd;
1872
1873     dd = cr->dd;
1874     if (natoms == -1)
1875     {
1876         natoms = dd->comm->nat[ddnatVSITE];
1877     }
1878
1879     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1880
1881     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1882     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1883
1884     out = gmx_fio_fopen(fname,"w");
1885
1886     fprintf(out,"TITLE     %s\n",title);
1887     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1888     for(i=0; i<natoms; i++)
1889     {
1890         ii = dd->gatindex[i];
1891         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1892         if (i < dd->comm->nat[ddnatZONE])
1893         {
1894             c = 0;
1895             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1896             {
1897                 c++;
1898             }
1899             b = c;
1900         }
1901         else if (i < dd->comm->nat[ddnatVSITE])
1902         {
1903             b = dd->comm->zones.n;
1904         }
1905         else
1906         {
1907             b = dd->comm->zones.n + 1;
1908         }
1909         fprintf(out,strlen(atomname)<4 ? format : format4,
1910                 "ATOM",(ii+1)%100000,
1911                 atomname,resname,' ',(resnr+1)%10000,' ',
1912                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1913     }
1914     fprintf(out,"TER\n");
1915
1916     gmx_fio_fclose(out);
1917 }
1918
1919 real dd_cutoff_mbody(gmx_domdec_t *dd)
1920 {
1921     gmx_domdec_comm_t *comm;
1922     int  di;
1923     real r;
1924
1925     comm = dd->comm;
1926
1927     r = -1;
1928     if (comm->bInterCGBondeds)
1929     {
1930         if (comm->cutoff_mbody > 0)
1931         {
1932             r = comm->cutoff_mbody;
1933         }
1934         else
1935         {
1936             /* cutoff_mbody=0 means we do not have DLB */
1937             r = comm->cellsize_min[dd->dim[0]];
1938             for(di=1; di<dd->ndim; di++)
1939             {
1940                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1941             }
1942             if (comm->bBondComm)
1943             {
1944                 r = max(r,comm->cutoff_mbody);
1945             }
1946             else
1947             {
1948                 r = min(r,comm->cutoff);
1949             }
1950         }
1951     }
1952
1953     return r;
1954 }
1955
1956 real dd_cutoff_twobody(gmx_domdec_t *dd)
1957 {
1958     real r_mb;
1959
1960     r_mb = dd_cutoff_mbody(dd);
1961
1962     return max(dd->comm->cutoff,r_mb);
1963 }
1964
1965
1966 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
1967 {
1968     int nc,ntot;
1969
1970     nc   = dd->nc[dd->comm->cartpmedim];
1971     ntot = dd->comm->ntot[dd->comm->cartpmedim];
1972     copy_ivec(coord,coord_pme);
1973     coord_pme[dd->comm->cartpmedim] =
1974         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
1975 }
1976
1977 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
1978 {
1979     /* Here we assign a PME node to communicate with this DD node
1980      * by assuming that the major index of both is x.
1981      * We add cr->npmenodes/2 to obtain an even distribution.
1982      */
1983     return (ddindex*npme + npme/2)/ndd;
1984 }
1985
1986 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
1987 {
1988     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
1989 }
1990
1991 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
1992 {
1993     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
1994 }
1995
1996 static int *dd_pmenodes(t_commrec *cr)
1997 {
1998     int *pmenodes;
1999     int n,i,p0,p1;
2000
2001     snew(pmenodes,cr->npmenodes);
2002     n = 0;
2003     for(i=0; i<cr->dd->nnodes; i++) {
2004         p0 = cr_ddindex2pmeindex(cr,i);
2005         p1 = cr_ddindex2pmeindex(cr,i+1);
2006         if (i+1 == cr->dd->nnodes || p1 > p0) {
2007             if (debug)
2008                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2009             pmenodes[n] = i + 1 + n;
2010             n++;
2011         }
2012     }
2013
2014     return pmenodes;
2015 }
2016
2017 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2018 {
2019     gmx_domdec_t *dd;
2020     ivec coords,coords_pme,nc;
2021     int  slab;
2022
2023     dd = cr->dd;
2024     /*
2025       if (dd->comm->bCartesian) {
2026       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2027       dd_coords2pmecoords(dd,coords,coords_pme);
2028       copy_ivec(dd->ntot,nc);
2029       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2030       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2031
2032       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2033       } else {
2034       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2035       }
2036     */
2037     coords[XX] = x;
2038     coords[YY] = y;
2039     coords[ZZ] = z;
2040     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2041
2042     return slab;
2043 }
2044
2045 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2046 {
2047     gmx_domdec_comm_t *comm;
2048     ivec coords;
2049     int  ddindex,nodeid=-1;
2050
2051     comm = cr->dd->comm;
2052
2053     coords[XX] = x;
2054     coords[YY] = y;
2055     coords[ZZ] = z;
2056     if (comm->bCartesianPP_PME)
2057     {
2058 #ifdef GMX_MPI
2059         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2060 #endif
2061     }
2062     else
2063     {
2064         ddindex = dd_index(cr->dd->nc,coords);
2065         if (comm->bCartesianPP)
2066         {
2067             nodeid = comm->ddindex2simnodeid[ddindex];
2068         }
2069         else
2070         {
2071             if (comm->pmenodes)
2072             {
2073                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2074             }
2075             else
2076             {
2077                 nodeid = ddindex;
2078             }
2079         }
2080     }
2081
2082     return nodeid;
2083 }
2084
2085 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2086 {
2087     gmx_domdec_t *dd;
2088     gmx_domdec_comm_t *comm;
2089     ivec coord,coord_pme;
2090     int  i;
2091     int  pmenode=-1;
2092
2093     dd = cr->dd;
2094     comm = dd->comm;
2095
2096     /* This assumes a uniform x domain decomposition grid cell size */
2097     if (comm->bCartesianPP_PME)
2098     {
2099 #ifdef GMX_MPI
2100         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2101         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2102         {
2103             /* This is a PP node */
2104             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2105             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2106         }
2107 #endif
2108     }
2109     else if (comm->bCartesianPP)
2110     {
2111         if (sim_nodeid < dd->nnodes)
2112         {
2113             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2114         }
2115     }
2116     else
2117     {
2118         /* This assumes DD cells with identical x coordinates
2119          * are numbered sequentially.
2120          */
2121         if (dd->comm->pmenodes == NULL)
2122         {
2123             if (sim_nodeid < dd->nnodes)
2124             {
2125                 /* The DD index equals the nodeid */
2126                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2127             }
2128         }
2129         else
2130         {
2131             i = 0;
2132             while (sim_nodeid > dd->comm->pmenodes[i])
2133             {
2134                 i++;
2135             }
2136             if (sim_nodeid < dd->comm->pmenodes[i])
2137             {
2138                 pmenode = dd->comm->pmenodes[i];
2139             }
2140         }
2141     }
2142
2143     return pmenode;
2144 }
2145
2146 bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2147 {
2148     bool bPMEOnlyNode;
2149
2150     if (DOMAINDECOMP(cr))
2151     {
2152         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2153     }
2154     else
2155     {
2156         bPMEOnlyNode = FALSE;
2157     }
2158
2159     return bPMEOnlyNode;
2160 }
2161
2162 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2163                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2164 {
2165     gmx_domdec_t *dd;
2166     int x,y,z;
2167     ivec coord,coord_pme;
2168
2169     dd = cr->dd;
2170
2171     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2172
2173     *nmy_ddnodes = 0;
2174     for(x=0; x<dd->nc[XX]; x++)
2175     {
2176         for(y=0; y<dd->nc[YY]; y++)
2177         {
2178             for(z=0; z<dd->nc[ZZ]; z++)
2179             {
2180                 if (dd->comm->bCartesianPP_PME)
2181                 {
2182                     coord[XX] = x;
2183                     coord[YY] = y;
2184                     coord[ZZ] = z;
2185                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2186                     if (dd->ci[XX] == coord_pme[XX] &&
2187                         dd->ci[YY] == coord_pme[YY] &&
2188                         dd->ci[ZZ] == coord_pme[ZZ])
2189                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2190                 }
2191                 else
2192                 {
2193                     /* The slab corresponds to the nodeid in the PME group */
2194                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2195                     {
2196                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2197                     }
2198                 }
2199             }
2200         }
2201     }
2202
2203     /* The last PP-only node is the peer node */
2204     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2205
2206     if (debug)
2207     {
2208         fprintf(debug,"Receive coordinates from PP nodes:");
2209         for(x=0; x<*nmy_ddnodes; x++)
2210         {
2211             fprintf(debug," %d",(*my_ddnodes)[x]);
2212         }
2213         fprintf(debug,"\n");
2214     }
2215 }
2216
2217 static bool receive_vir_ener(t_commrec *cr)
2218 {
2219     gmx_domdec_comm_t *comm;
2220     int  pmenode,coords[DIM],rank;
2221     bool bReceive;
2222
2223     bReceive = TRUE;
2224     if (cr->npmenodes < cr->dd->nnodes)
2225     {
2226         comm = cr->dd->comm;
2227         if (comm->bCartesianPP_PME)
2228         {
2229             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2230 #ifdef GMX_MPI
2231             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2232             coords[comm->cartpmedim]++;
2233             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2234             {
2235                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2236                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2237                 {
2238                     /* This is not the last PP node for pmenode */
2239                     bReceive = FALSE;
2240                 }
2241             }
2242 #endif
2243         }
2244         else
2245         {
2246             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2247             if (cr->sim_nodeid+1 < cr->nnodes &&
2248                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2249             {
2250                 /* This is not the last PP node for pmenode */
2251                 bReceive = FALSE;
2252             }
2253         }
2254     }
2255
2256     return bReceive;
2257 }
2258
2259 static void set_zones_ncg_home(gmx_domdec_t *dd)
2260 {
2261     gmx_domdec_zones_t *zones;
2262     int i;
2263
2264     zones = &dd->comm->zones;
2265
2266     zones->cg_range[0] = 0;
2267     for(i=1; i<zones->n+1; i++)
2268     {
2269         zones->cg_range[i] = dd->ncg_home;
2270     }
2271 }
2272
2273 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2274 {
2275     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2276
2277     ind = state->cg_gl;
2278     dd_cg_gl = dd->index_gl;
2279     cgindex  = dd->cgindex;
2280     nat = 0;
2281     cgindex[0] = nat;
2282     for(i=0; i<state->ncg_gl; i++)
2283     {
2284         cgindex[i] = nat;
2285         cg_gl = ind[i];
2286         dd_cg_gl[i] = cg_gl;
2287         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2288     }
2289     cgindex[i] = nat;
2290
2291     dd->ncg_home = state->ncg_gl;
2292     dd->nat_home = nat;
2293
2294     set_zones_ncg_home(dd);
2295 }
2296
2297 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2298 {
2299     while (cg >= cginfo_mb->cg_end)
2300     {
2301         cginfo_mb++;
2302     }
2303
2304     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2305 }
2306
2307 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2308                           t_forcerec *fr,char *bLocalCG)
2309 {
2310     cginfo_mb_t *cginfo_mb;
2311     int *cginfo;
2312     int cg;
2313
2314     if (fr != NULL)
2315     {
2316         cginfo_mb = fr->cginfo_mb;
2317         cginfo    = fr->cginfo;
2318
2319         for(cg=cg0; cg<cg1; cg++)
2320         {
2321             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2322         }
2323     }
2324
2325     if (bLocalCG != NULL)
2326     {
2327         for(cg=cg0; cg<cg1; cg++)
2328         {
2329             bLocalCG[index_gl[cg]] = TRUE;
2330         }
2331     }
2332 }
2333
2334 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2335 {
2336     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2337     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2338     gmx_ga2la_t *ga2la;
2339     char *bLocalCG;
2340
2341     bLocalCG = dd->comm->bLocalCG;
2342
2343     if (dd->nat_tot > dd->gatindex_nalloc)
2344     {
2345         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2346         srenew(dd->gatindex,dd->gatindex_nalloc);
2347     }
2348
2349     nzone      = dd->comm->zones.n;
2350     zone2cg    = dd->comm->zones.cg_range;
2351     zone_ncg1  = dd->comm->zone_ncg1;
2352     index_gl   = dd->index_gl;
2353     gatindex   = dd->gatindex;
2354
2355     if (zone2cg[1] != dd->ncg_home)
2356     {
2357         gmx_incons("dd->ncg_zone is not up to date");
2358     }
2359
2360     /* Make the local to global and global to local atom index */
2361     a = dd->cgindex[cg_start];
2362     for(zone=0; zone<nzone; zone++)
2363     {
2364         if (zone == 0)
2365         {
2366             cg0 = cg_start;
2367         }
2368         else
2369         {
2370             cg0 = zone2cg[zone];
2371         }
2372         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2373         {
2374             zone1 = zone;
2375             if (cg - cg0 >= zone_ncg1[zone])
2376             {
2377                 /* Signal that this cg is from more than one zone away */
2378                 zone1 += nzone;
2379             }
2380             cg_gl = index_gl[cg];
2381             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2382             {
2383                 gatindex[a] = a_gl;
2384                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2385                 a++;
2386             }
2387         }
2388     }
2389 }
2390
2391 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2392                           const char *where)
2393 {
2394     int ncg,i,ngl,nerr;
2395
2396     nerr = 0;
2397     if (bLocalCG == NULL)
2398     {
2399         return nerr;
2400     }
2401     for(i=0; i<dd->ncg_tot; i++)
2402     {
2403         if (!bLocalCG[dd->index_gl[i]])
2404         {
2405             fprintf(stderr,
2406                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2407             nerr++;
2408         }
2409     }
2410     ngl = 0;
2411     for(i=0; i<ncg_sys; i++)
2412     {
2413         if (bLocalCG[i])
2414         {
2415             ngl++;
2416         }
2417     }
2418     if (ngl != dd->ncg_tot)
2419     {
2420         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2421         nerr++;
2422     }
2423
2424     return nerr;
2425 }
2426
2427 static void check_index_consistency(gmx_domdec_t *dd,
2428                                     int natoms_sys,int ncg_sys,
2429                                     const char *where)
2430 {
2431     int  nerr,ngl,i,a,cell;
2432     int  *have;
2433
2434     nerr = 0;
2435
2436     if (dd->comm->DD_debug > 1)
2437     {
2438         snew(have,natoms_sys);
2439         for(a=0; a<dd->nat_tot; a++)
2440         {
2441             if (have[dd->gatindex[a]] > 0)
2442             {
2443                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2444             }
2445             else
2446             {
2447                 have[dd->gatindex[a]] = a + 1;
2448             }
2449         }
2450         sfree(have);
2451     }
2452
2453     snew(have,dd->nat_tot);
2454
2455     ngl  = 0;
2456     for(i=0; i<natoms_sys; i++)
2457     {
2458         if (ga2la_get(dd->ga2la,i,&a,&cell))
2459         {
2460             if (a >= dd->nat_tot)
2461             {
2462                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2463                 nerr++;
2464             }
2465             else
2466             {
2467                 have[a] = 1;
2468                 if (dd->gatindex[a] != i)
2469                 {
2470                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2471                     nerr++;
2472                 }
2473             }
2474             ngl++;
2475         }
2476     }
2477     if (ngl != dd->nat_tot)
2478     {
2479         fprintf(stderr,
2480                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2481                 dd->rank,where,ngl,dd->nat_tot);
2482     }
2483     for(a=0; a<dd->nat_tot; a++)
2484     {
2485         if (have[a] == 0)
2486         {
2487             fprintf(stderr,
2488                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2489                     dd->rank,where,a+1,dd->gatindex[a]+1);
2490         }
2491     }
2492     sfree(have);
2493
2494     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2495
2496     if (nerr > 0) {
2497         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2498                   dd->rank,where,nerr);
2499     }
2500 }
2501
2502 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2503 {
2504     int  i;
2505     char *bLocalCG;
2506
2507     if (a_start == 0)
2508     {
2509         /* Clear the whole list without searching */
2510         ga2la_clear(dd->ga2la);
2511     }
2512     else
2513     {
2514         for(i=a_start; i<dd->nat_tot; i++)
2515         {
2516             ga2la_del(dd->ga2la,dd->gatindex[i]);
2517         }
2518     }
2519
2520     bLocalCG = dd->comm->bLocalCG;
2521     if (bLocalCG)
2522     {
2523         for(i=cg_start; i<dd->ncg_tot; i++)
2524         {
2525             bLocalCG[dd->index_gl[i]] = FALSE;
2526         }
2527     }
2528
2529     dd_clear_local_vsite_indices(dd);
2530
2531     if (dd->constraints)
2532     {
2533         dd_clear_local_constraint_indices(dd);
2534     }
2535 }
2536
2537 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2538 {
2539     real grid_jump_limit;
2540
2541     /* The distance between the boundaries of cells at distance
2542      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2543      * and by the fact that cells should not be shifted by more than
2544      * half their size, such that cg's only shift by one cell
2545      * at redecomposition.
2546      */
2547     grid_jump_limit = comm->cellsize_limit;
2548     if (!comm->bVacDLBNoLimit)
2549     {
2550         grid_jump_limit = max(grid_jump_limit,
2551                               comm->cutoff/comm->cd[dim_ind].np);
2552     }
2553
2554     return grid_jump_limit;
2555 }
2556
2557 static void check_grid_jump(gmx_step_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2558 {
2559     gmx_domdec_comm_t *comm;
2560     int  d,dim;
2561     real limit,bfac;
2562
2563     comm = dd->comm;
2564
2565     for(d=1; d<dd->ndim; d++)
2566     {
2567         dim = dd->dim[d];
2568         limit = grid_jump_limit(comm,d);
2569         bfac = ddbox->box_size[dim];
2570         if (ddbox->tric_dir[dim])
2571         {
2572             bfac *= ddbox->skew_fac[dim];
2573         }
2574         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2575             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2576         {
2577             char buf[22];
2578             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2579                       gmx_step_str(step,buf),
2580                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2581         }
2582     }
2583 }
2584
2585 static int dd_load_count(gmx_domdec_comm_t *comm)
2586 {
2587     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2588 }
2589
2590 static float dd_force_load(gmx_domdec_comm_t *comm)
2591 {
2592     float load;
2593
2594     if (comm->eFlop)
2595     {
2596         load = comm->flop;
2597         if (comm->eFlop > 1)
2598         {
2599             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2600         }
2601     }
2602     else
2603     {
2604         load = comm->cycl[ddCyclF];
2605         if (comm->cycl_n[ddCyclF] > 1)
2606         {
2607             /* Subtract the maximum of the last n cycle counts
2608              * to get rid of possible high counts due to other soures,
2609              * for instance system activity, that would otherwise
2610              * affect the dynamic load balancing.
2611              */
2612             load -= comm->cycl_max[ddCyclF];
2613         }
2614     }
2615
2616     return load;
2617 }
2618
2619 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dimind,real **dim_f)
2620 {
2621     gmx_domdec_comm_t *comm;
2622     int i;
2623
2624     comm = dd->comm;
2625
2626     if (dd->dim[dimind] != dimind)
2627     {
2628         *dim_f = NULL;
2629         return;
2630     }
2631
2632     snew(*dim_f,dd->nc[dimind]+1);
2633     (*dim_f)[0] = 0;
2634     for(i=1; i<dd->nc[dimind]; i++)
2635     {
2636         if (comm->slb_frac[dimind])
2637         {
2638             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dimind][i-1];
2639         }
2640         else
2641         {
2642             (*dim_f)[i] = (real)i/(real)dd->nc[dimind];
2643         }
2644     }
2645     (*dim_f)[dd->nc[dimind]] = 1;
2646 }
2647
2648 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2649                        int dimind,int nslab)
2650 {
2651     int  pmeindex,slab,nso,i;
2652     ivec xyz;
2653
2654     ddpme->dimind = dimind;
2655     ddpme->nslab  = nslab;
2656
2657     if (nslab <= 1)
2658     {
2659         return;
2660     }
2661
2662     nso = dd->comm->npmenodes/nslab;
2663     /* Determine for each PME slab the PP locacation range for dimension dim */
2664     snew(ddpme->pp_min,nslab);
2665     snew(ddpme->pp_max,nslab);
2666     for(slab=0; slab<nslab; slab++) {
2667         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2668         ddpme->pp_max[slab] = 0;
2669     }
2670     for(i=0; i<dd->nnodes; i++) {
2671         ddindex2xyz(dd->nc,i,xyz);
2672         /* For y only use our y/z slab.
2673          * This assumes that the PME x grid size matches the DD grid size.
2674          */
2675         if (dimind == 0 || xyz[YY] == dd->ci[YY]) {
2676             pmeindex = ddindex2pmeindex(dd,i);
2677             if (dimind == 0) {
2678                 slab = pmeindex/nso;
2679             } else {
2680                 slab = pmeindex % nslab;
2681             }
2682             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2683             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2684         }
2685     }
2686
2687     set_slb_pme_dim_f(dd,ddpme->dimind,&ddpme->slb_dim_f);
2688 }
2689
2690 int dd_pme_maxshift0(gmx_domdec_t *dd)
2691 {
2692     return dd->comm->ddpme[0].maxshift;
2693 }
2694
2695 int dd_pme_maxshift1(gmx_domdec_t *dd)
2696 {
2697     return dd->comm->ddpme[1].maxshift;
2698 }
2699
2700 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2701                              bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2702 {
2703     gmx_domdec_comm_t *comm;
2704     int  dim,nc,ns,s;
2705     int  *xmin,*xmax;
2706     real range,pme_boundary;
2707     int  sh;
2708
2709     comm = dd->comm;
2710     dim = ddpme->dimind;
2711     nc  = dd->nc[dim];
2712     ns  = ddpme->nslab;
2713
2714     if (dd->dim[dim] != dim)
2715     {
2716         /* PP decomposition is not along dim: the worst situation */
2717         sh = ns/2;
2718     }
2719     else if (ns <= 3 || (bUniform && ns == nc))
2720     {
2721         /* The optimal situation */
2722         sh = 1;
2723     }
2724     else
2725     {
2726         /* We need to check for all pme nodes which nodes they
2727          * could possibly need to communicate with.
2728          */
2729         xmin = ddpme->pp_min;
2730         xmax = ddpme->pp_max;
2731         /* Allow for atoms to be maximally half the cell size or cut-off
2732          * out of their DD cell.
2733          */
2734         range  = 0.5*min(comm->cellsize_min[dim],comm->cutoff);
2735         range /= ddbox->skew_fac[dim]*ddbox->box_size[dim];
2736         /* Avoid unlucky rounding at exactly 0.5 */
2737         range *= 0.999;
2738
2739         sh = 1;
2740         for(s=0; s<ns; s++)
2741         {
2742             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2743             pme_boundary = (real)s/ns;
2744             while (sh+1 < ns &&
2745                    ((s-(sh+1) >= 0 &&
2746                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2747                     (s-(sh+1) <  0 &&
2748                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2749             {
2750                 sh++;
2751             }
2752             pme_boundary = (real)(s+1)/ns;
2753             while (sh+1 < ns &&
2754                    ((s+(sh+1) <  ns &&
2755                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2756                     (s+(sh+1) >= ns &&
2757                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2758             {
2759                 sh++;
2760             }
2761         }
2762     }
2763
2764     ddpme->maxshift = sh;
2765
2766     if (debug)
2767     {
2768         fprintf(debug,"PME slab communication range for dimind %d is %d\n",
2769                 ddpme->dimind,ddpme->maxshift);
2770     }
2771 }
2772
2773 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2774 {
2775     int d,dim;
2776
2777     for(d=0; d<dd->ndim; d++)
2778     {
2779         dim = dd->dim[d];
2780         if (dim < ddbox->nboundeddim &&
2781             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2782             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2783         {
2784             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2785                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2786                       dd->nc[dim],dd->comm->cellsize_limit);
2787         }
2788     }
2789 }
2790
2791 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2792                                   bool bMaster,ivec npulse)
2793 {
2794     gmx_domdec_comm_t *comm;
2795     int  d,j;
2796     rvec cellsize_min;
2797     real *cell_x,cell_dx,cellsize;
2798
2799     comm = dd->comm;
2800
2801     for(d=0; d<DIM; d++)
2802     {
2803         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2804         npulse[d] = 1;
2805         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2806         {
2807             /* Uniform grid */
2808             cell_dx = ddbox->box_size[d]/dd->nc[d];
2809             if (bMaster)
2810             {
2811                 for(j=0; j<dd->nc[d]+1; j++)
2812                 {
2813                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2814                 }
2815             }
2816             else
2817             {
2818                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2819                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2820             }
2821             cellsize = cell_dx*ddbox->skew_fac[d];
2822             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2823             {
2824                 npulse[d]++;
2825             }
2826             cellsize_min[d] = cellsize;
2827         }
2828         else
2829         {
2830             /* Statically load balanced grid */
2831             /* Also when we are not doing a master distribution we determine
2832              * all cell borders in a loop to obtain identical values
2833              * to the master distribution case and to determine npulse.
2834              */
2835             if (bMaster)
2836             {
2837                 cell_x = dd->ma->cell_x[d];
2838             }
2839             else
2840             {
2841                 snew(cell_x,dd->nc[d]+1);
2842             }
2843             cell_x[0] = ddbox->box0[d];
2844             for(j=0; j<dd->nc[d]; j++)
2845             {
2846                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2847                 cell_x[j+1] = cell_x[j] + cell_dx;
2848                 cellsize = cell_dx*ddbox->skew_fac[d];
2849                 while (cellsize*npulse[d] < comm->cutoff &&
2850                        npulse[d] < dd->nc[d]-1)
2851                 {
2852                     npulse[d]++;
2853                 }
2854                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2855             }
2856             if (!bMaster)
2857             {
2858                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2859                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2860                 sfree(cell_x);
2861             }
2862         }
2863         /* The following limitation is to avoid that a cell would receive
2864          * some of its own home charge groups back over the periodic boundary.
2865          * Double charge groups cause trouble with the global indices.
2866          */
2867         if (d < ddbox->npbcdim &&
2868             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2869         {
2870             if (DDMASTER(dd))
2871             {
2872                 gmx_fatal(FARGS,"The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2873                           dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2874                           comm->cutoff,
2875                           dd->nc[d],dd->nc[d],
2876                           dd->nnodes > dd->nc[d] ? "cells" : "processors");
2877             }
2878 #ifdef GMX_MPI
2879             MPI_Abort(MPI_COMM_WORLD, 0);
2880 #else
2881             exit(0);
2882 #endif
2883         }
2884     }
2885
2886     if (!comm->bDynLoadBal)
2887     {
2888         copy_rvec(cellsize_min,comm->cellsize_min);
2889     }
2890
2891     for(d=0; d<comm->npmedecompdim; d++)
2892     {
2893         set_pme_maxshift(dd,&comm->ddpme[d],
2894                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2895                          comm->ddpme[d].slb_dim_f);
2896     }
2897 }
2898
2899
2900 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2901                                        int d,int dim,gmx_domdec_root_t *root,
2902                                        gmx_ddbox_t *ddbox,
2903                                        bool bUniform,gmx_step_t step, real cellsize_limit_f, int range[])
2904 {
2905     gmx_domdec_comm_t *comm;
2906     int  ncd,i,j,nmin,nmin_old;
2907     bool bLimLo,bLimHi;
2908     real *cell_size;
2909     real fac,halfway,cellsize_limit_f_i,region_size;
2910     bool bPBC,bLastHi=FALSE;
2911     int nrange[]={range[0],range[1]};
2912
2913     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2914
2915     comm = dd->comm;
2916
2917     ncd = dd->nc[dim];
2918
2919     bPBC = (dim < ddbox->npbcdim);
2920
2921     cell_size = root->buf_ncd;
2922
2923     if (debug)
2924     {
2925         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2926     }
2927
2928     /* First we need to check if the scaling does not make cells
2929      * smaller than the smallest allowed size.
2930      * We need to do this iteratively, since if a cell is too small,
2931      * it needs to be enlarged, which makes all the other cells smaller,
2932      * which could in turn make another cell smaller than allowed.
2933      */
2934     for(i=range[0]; i<range[1]; i++)
2935     {
2936         root->bCellMin[i] = FALSE;
2937     }
2938     nmin = 0;
2939     do
2940     {
2941         nmin_old = nmin;
2942         /* We need the total for normalization */
2943         fac = 0;
2944         for(i=range[0]; i<range[1]; i++)
2945         {
2946             if (root->bCellMin[i] == FALSE)
2947             {
2948                 fac += cell_size[i];
2949             }
2950         }
2951         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
2952         /* Determine the cell boundaries */
2953         for(i=range[0]; i<range[1]; i++)
2954         {
2955             if (root->bCellMin[i] == FALSE)
2956             {
2957                 cell_size[i] *= fac;
2958                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2959                 {
2960                     cellsize_limit_f_i = 0;
2961                 }
2962                 else
2963                 {
2964                     cellsize_limit_f_i = cellsize_limit_f;
2965                 }
2966                 if (cell_size[i] < cellsize_limit_f_i)
2967                 {
2968                     root->bCellMin[i] = TRUE;
2969                     cell_size[i] = cellsize_limit_f_i;
2970                     nmin++;
2971                 }
2972             }
2973             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2974         }
2975     }
2976     while (nmin > nmin_old);
2977
2978     i=range[1]-1;
2979     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2980     /* For this check we should not use DD_CELL_MARGIN,
2981      * but a slightly smaller factor,
2982      * since rounding could get use below the limit.
2983      */
2984     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2985     {
2986         char buf[22];
2987         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2988                   gmx_step_str(step,buf),
2989                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2990                   ncd,comm->cellsize_min[dim]);
2991     }
2992
2993     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
2994
2995     if (!bUniform)
2996     {
2997         /* Check if the boundary did not displace more than halfway
2998          * each of the cells it bounds, as this could cause problems,
2999          * especially when the differences between cell sizes are large.
3000          * If changes are applied, they will not make cells smaller
3001          * than the cut-off, as we check all the boundaries which
3002          * might be affected by a change and if the old state was ok,
3003          * the cells will at most be shrunk back to their old size.
3004          */
3005         for(i=range[0]+1; i<range[1]; i++)
3006         {
3007             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3008             if (root->cell_f[i] < halfway)
3009             {
3010                 root->cell_f[i] = halfway;
3011                 /* Check if the change also causes shifts of the next boundaries */
3012                 for(j=i+1; j<range[1]; j++)
3013                 {
3014                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3015                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3016                 }
3017             }
3018             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3019             if (root->cell_f[i] > halfway)
3020             {
3021                 root->cell_f[i] = halfway;
3022                 /* Check if the change also causes shifts of the next boundaries */
3023                 for(j=i-1; j>=range[0]+1; j--)
3024                 {
3025                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3026                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3027                 }
3028             }
3029         }
3030     }
3031
3032     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3033     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3034      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3035      * for a and b nrange is used */
3036     if (d > 0)
3037     {
3038         /* Take care of the staggering of the cell boundaries */
3039         if (bUniform)
3040         {
3041             for(i=range[0]; i<range[1]; i++)
3042             {
3043                 root->cell_f_max0[i] = root->cell_f[i];
3044                 root->cell_f_min1[i] = root->cell_f[i+1];
3045             }
3046         }
3047         else
3048         {
3049             for(i=range[0]+1; i<range[1]; i++)
3050             {
3051                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3052                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3053                 if (bLimLo && bLimHi)
3054                 {
3055                     /* Both limits violated, try the best we can */
3056                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3057                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3058                     nrange[0]=range[0];
3059                     nrange[1]=i;
3060                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3061
3062                     nrange[0]=i;
3063                     nrange[1]=range[1];
3064                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3065
3066                     return;
3067                 }
3068                 else if (bLimLo)
3069                 {
3070                     /* root->cell_f[i] = root->bound_min[i]; */
3071                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3072                     bLastHi=FALSE;
3073                 }
3074                 else if (bLimHi && !bLastHi)
3075                 {
3076                     bLastHi=TRUE;
3077                     if (nrange[1] < range[1])   /* found a LimLo before */
3078                     {
3079                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3080                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3081                         nrange[0]=nrange[1];
3082                     }
3083                     root->cell_f[i] = root->bound_max[i];
3084                     nrange[1]=i;
3085                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3086                     nrange[0]=i;
3087                     nrange[1]=range[1];
3088                 }
3089             }
3090             if (nrange[1] < range[1])   /* found last a LimLo */
3091             {
3092                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3093                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3094                 nrange[0]=nrange[1];
3095                 nrange[1]=range[1];
3096                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3097             }
3098             else if (nrange[0] > range[0]) /* found at least one LimHi */
3099             {
3100                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3101             }
3102         }
3103     }
3104 }
3105
3106
3107 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3108                                        int d,int dim,gmx_domdec_root_t *root,
3109                                        gmx_ddbox_t *ddbox,bool bDynamicBox,
3110                                        bool bUniform,gmx_step_t step)
3111 {
3112     gmx_domdec_comm_t *comm;
3113     int  ncd,d1,i,j,pos;
3114     real *cell_size;
3115     real load_aver,load_i,imbalance,change,change_max,sc;
3116     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3117     real change_limit = 0.1;
3118     real relax = 0.5;
3119     bool bPBC;
3120     int range[] = { 0, 0 };
3121
3122     comm = dd->comm;
3123
3124     ncd = dd->nc[dim];
3125
3126     bPBC = (dim < ddbox->npbcdim);
3127
3128     cell_size = root->buf_ncd;
3129
3130     /* Store the original boundaries */
3131     for(i=0; i<ncd+1; i++)
3132     {
3133         root->old_cell_f[i] = root->cell_f[i];
3134     }
3135     if (bUniform) {
3136         for(i=0; i<ncd; i++)
3137         {
3138             cell_size[i] = 1.0/ncd;
3139         }
3140     }
3141     else if (dd_load_count(comm))
3142     {
3143         load_aver = comm->load[d].sum_m/ncd;
3144         change_max = 0;
3145         for(i=0; i<ncd; i++)
3146         {
3147             /* Determine the relative imbalance of cell i */
3148             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3149             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3150             /* Determine the change of the cell size using underrelaxation */
3151             change = -relax*imbalance;
3152             change_max = max(change_max,max(change,-change));
3153         }
3154         /* Limit the amount of scaling.
3155          * We need to use the same rescaling for all cells in one row,
3156          * otherwise the load balancing might not converge.
3157          */
3158         sc = relax;
3159         if (change_max > change_limit)
3160         {
3161             sc *= change_limit/change_max;
3162         }
3163         for(i=0; i<ncd; i++)
3164         {
3165             /* Determine the relative imbalance of cell i */
3166             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3167             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3168             /* Determine the change of the cell size using underrelaxation */
3169             change = -sc*imbalance;
3170             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3171         }
3172     }
3173
3174     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3175     cellsize_limit_f *= DD_CELL_MARGIN;
3176     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3177     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3178     if (ddbox->tric_dir[dim])
3179     {
3180         cellsize_limit_f /= ddbox->skew_fac[dim];
3181         dist_min_f       /= ddbox->skew_fac[dim];
3182     }
3183     if (bDynamicBox && d > 0)
3184     {
3185         dist_min_f *= DD_PRES_SCALE_MARGIN;
3186     }
3187     if (d > 0 && !bUniform)
3188     {
3189         /* Make sure that the grid is not shifted too much */
3190         for(i=1; i<ncd; i++) {
3191             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3192             {
3193                 gmx_incons("Inconsistent DD boundary staggering limits!");
3194             }
3195             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3196             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3197             if (space > 0) {
3198                 root->bound_min[i] += 0.5*space;
3199             }
3200             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3201             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3202             if (space < 0) {
3203                 root->bound_max[i] += 0.5*space;
3204             }
3205             if (debug)
3206             {
3207                 fprintf(debug,
3208                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3209                         d,i,
3210                         root->cell_f_max0[i-1] + dist_min_f,
3211                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3212                         root->cell_f_min1[i] - dist_min_f);
3213             }
3214         }
3215     }
3216     range[1]=ncd;
3217     root->cell_f[0] = 0;
3218     root->cell_f[ncd] = 1;
3219     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3220
3221
3222     /* After the checks above, the cells should obey the cut-off
3223      * restrictions, but it does not hurt to check.
3224      */
3225     for(i=0; i<ncd; i++)
3226     {
3227         if (debug)
3228         {
3229             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3230                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3231         }
3232
3233         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3234             root->cell_f[i+1] - root->cell_f[i] <
3235             cellsize_limit_f/DD_CELL_MARGIN)
3236         {
3237             char buf[22];
3238             fprintf(stderr,
3239                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3240                     gmx_step_str(step,buf),dim2char(dim),i,
3241                     (root->cell_f[i+1] - root->cell_f[i])
3242                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3243         }
3244     }
3245
3246     pos = ncd + 1;
3247     /* Store the cell boundaries of the lower dimensions at the end */
3248     for(d1=0; d1<d; d1++)
3249     {
3250         root->cell_f[pos++] = comm->cell_f0[d1];
3251         root->cell_f[pos++] = comm->cell_f1[d1];
3252     }
3253
3254     if (d < comm->npmedecompdim)
3255     {
3256         /* The master determines the maximum shift for
3257          * the coordinate communication between separate PME nodes.
3258          */
3259         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3260     }
3261     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3262     if (d >= 1)
3263     {
3264         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3265     }
3266 }
3267
3268 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3269                                              gmx_ddbox_t *ddbox,int dimind)
3270 {
3271     gmx_domdec_comm_t *comm;
3272     int dim;
3273
3274     comm = dd->comm;
3275
3276     /* Set the cell dimensions */
3277     dim = dd->dim[dimind];
3278     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3279     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3280     if (dim >= ddbox->nboundeddim)
3281     {
3282         comm->cell_x0[dim] += ddbox->box0[dim];
3283         comm->cell_x1[dim] += ddbox->box0[dim];
3284     }
3285 }
3286
3287 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3288                                          int d,int dim,real *cell_f_row,
3289                                          gmx_ddbox_t *ddbox)
3290 {
3291     gmx_domdec_comm_t *comm;
3292     int d1,dim1,pos;
3293
3294     comm = dd->comm;
3295
3296 #ifdef GMX_MPI
3297     /* Each node would only need to know two fractions,
3298      * but it is probably cheaper to broadcast the whole array.
3299      */
3300     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3301               0,comm->mpi_comm_load[d]);
3302 #endif
3303     /* Copy the fractions for this dimension from the buffer */
3304     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3305     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3306     /* The whole array was communicated, so set the buffer position */
3307     pos = dd->nc[dim] + 1;
3308     for(d1=0; d1<=d; d1++)
3309     {
3310         if (d1 < d)
3311         {
3312             /* Copy the cell fractions of the lower dimensions */
3313             comm->cell_f0[d1] = cell_f_row[pos++];
3314             comm->cell_f1[d1] = cell_f_row[pos++];
3315         }
3316         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3317     }
3318     /* Convert the communicated shift from float to int */
3319     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3320     if (d >= 1)
3321     {
3322         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3323     }
3324 }
3325
3326 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3327                                          gmx_ddbox_t *ddbox,bool bDynamicBox,
3328                                          bool bUniform,gmx_step_t step)
3329 {
3330     gmx_domdec_comm_t *comm;
3331     int d,dim,d1;
3332     bool bRowMember,bRowRoot;
3333     real *cell_f_row;
3334
3335     comm = dd->comm;
3336
3337     for(d=0; d<dd->ndim; d++)
3338     {
3339         dim = dd->dim[d];
3340         bRowMember = TRUE;
3341         bRowRoot = TRUE;
3342         for(d1=d; d1<dd->ndim; d1++)
3343         {
3344             if (dd->ci[dd->dim[d1]] > 0)
3345             {
3346                 if (d1 > d)
3347                 {
3348                     bRowMember = FALSE;
3349                 }
3350                 bRowRoot = FALSE;
3351             }
3352         }
3353         if (bRowMember)
3354         {
3355             if (bRowRoot)
3356             {
3357                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3358                                            ddbox,bDynamicBox,bUniform,step);
3359                 cell_f_row = comm->root[d]->cell_f;
3360             }
3361             else
3362             {
3363                 cell_f_row = comm->cell_f_row;
3364             }
3365             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3366         }
3367     }
3368 }
3369
3370 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3371 {
3372     int d;
3373
3374     /* This function assumes the box is static and should therefore
3375      * not be called when the box has changed since the last
3376      * call to dd_partition_system.
3377      */
3378     for(d=0; d<dd->ndim; d++)
3379     {
3380         relative_to_absolute_cell_bounds(dd,ddbox,d);
3381     }
3382 }
3383
3384
3385
3386 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3387                                   gmx_ddbox_t *ddbox,bool bDynamicBox,
3388                                   bool bUniform,bool bDoDLB,gmx_step_t step,
3389                                   gmx_wallcycle_t wcycle)
3390 {
3391     gmx_domdec_comm_t *comm;
3392     int dim;
3393
3394     comm = dd->comm;
3395
3396     if (bDoDLB)
3397     {
3398         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3399         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3400         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3401     }
3402     else if (bDynamicBox)
3403     {
3404         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3405     }
3406
3407     /* Set the dimensions for which no DD is used */
3408     for(dim=0; dim<DIM; dim++) {
3409         if (dd->nc[dim] == 1) {
3410             comm->cell_x0[dim] = 0;
3411             comm->cell_x1[dim] = ddbox->box_size[dim];
3412             if (dim >= ddbox->nboundeddim)
3413             {
3414                 comm->cell_x0[dim] += ddbox->box0[dim];
3415                 comm->cell_x1[dim] += ddbox->box0[dim];
3416             }
3417         }
3418     }
3419 }
3420
3421 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3422 {
3423     int d,np,i;
3424     gmx_domdec_comm_dim_t *cd;
3425
3426     for(d=0; d<dd->ndim; d++)
3427     {
3428         cd = &dd->comm->cd[d];
3429         np = npulse[dd->dim[d]];
3430         if (np > cd->np_nalloc)
3431         {
3432             if (debug)
3433             {
3434                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3435                         dim2char(dd->dim[d]),np);
3436             }
3437             if (DDMASTER(dd) && cd->np_nalloc > 0)
3438             {
3439                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3440             }
3441             srenew(cd->ind,np);
3442             for(i=cd->np_nalloc; i<np; i++)
3443             {
3444                 cd->ind[i].index  = NULL;
3445                 cd->ind[i].nalloc = 0;
3446             }
3447             cd->np_nalloc = np;
3448         }
3449         cd->np = np;
3450     }
3451 }
3452
3453
3454 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3455                               gmx_ddbox_t *ddbox,bool bDynamicBox,
3456                               bool bUniform,bool bDoDLB,gmx_step_t step,
3457                               gmx_wallcycle_t wcycle)
3458 {
3459     gmx_domdec_comm_t *comm;
3460     int  d;
3461     ivec npulse;
3462
3463     comm = dd->comm;
3464
3465     /* Copy the old cell boundaries for the cg displacement check */
3466     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3467     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3468
3469     if (comm->bDynLoadBal)
3470     {
3471         if (DDMASTER(dd))
3472         {
3473             check_box_size(dd,ddbox);
3474         }
3475         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3476     }
3477     else
3478     {
3479         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3480         realloc_comm_ind(dd,npulse);
3481     }
3482
3483     if (debug)
3484     {
3485         for(d=0; d<DIM; d++)
3486         {
3487             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3488                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3489         }
3490     }
3491 }
3492
3493 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3494                                   gmx_ddbox_t *ddbox,
3495                                   rvec cell_ns_x0,rvec cell_ns_x1,
3496                                   gmx_step_t step)
3497 {
3498     gmx_domdec_comm_t *comm;
3499     int dim_ind,dim;
3500
3501     comm = dd->comm;
3502
3503     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3504     {
3505         dim = dd->dim[dim_ind];
3506
3507         /* Without PBC we don't have restrictions on the outer cells */
3508         if (!(dim >= ddbox->npbcdim &&
3509               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3510             comm->bDynLoadBal &&
3511             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3512             comm->cellsize_min[dim])
3513         {
3514             char buf[22];
3515             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3516                       gmx_step_str(step,buf),dim2char(dim),
3517                       comm->cell_x1[dim] - comm->cell_x0[dim],
3518                       ddbox->skew_fac[dim],
3519                       dd->comm->cellsize_min[dim],
3520                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3521         }
3522     }
3523
3524     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3525     {
3526         /* Communicate the boundaries and update cell_ns_x0/1 */
3527         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3528         if (dd->bGridJump && dd->ndim > 1)
3529         {
3530             check_grid_jump(step,dd,ddbox);
3531         }
3532     }
3533 }
3534
3535 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3536 {
3537     if (YY < npbcdim)
3538     {
3539         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3540     }
3541     else
3542     {
3543         tcm[YY][XX] = 0;
3544     }
3545     if (ZZ < npbcdim)
3546     {
3547         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3548         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3549     }
3550     else
3551     {
3552         tcm[ZZ][XX] = 0;
3553         tcm[ZZ][YY] = 0;
3554     }
3555 }
3556
3557 static void check_screw_box(matrix box)
3558 {
3559     /* Mathematical limitation */
3560     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3561     {
3562         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3563     }
3564
3565     /* Limitation due to the asymmetry of the eighth shell method */
3566     if (box[ZZ][YY] != 0)
3567     {
3568         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3569     }
3570 }
3571
3572 static void distribute_cg(FILE *fplog,gmx_step_t step,
3573                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3574                           gmx_domdec_t *dd)
3575 {
3576     gmx_domdec_master_t *ma;
3577     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3578     int  i,icg,j,k,k0,k1,d,npbcdim;
3579     matrix tcm;
3580     rvec box_size,cg_cm;
3581     ivec ind;
3582     real nrcg,inv_ncg,pos_d;
3583     atom_id *cgindex;
3584     bool bUnbounded,bScrew;
3585
3586     ma = dd->ma;
3587
3588     if (tmp_ind == NULL)
3589     {
3590         snew(tmp_nalloc,dd->nnodes);
3591         snew(tmp_ind,dd->nnodes);
3592         for(i=0; i<dd->nnodes; i++)
3593         {
3594             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3595             snew(tmp_ind[i],tmp_nalloc[i]);
3596         }
3597     }
3598
3599     /* Clear the count */
3600     for(i=0; i<dd->nnodes; i++)
3601     {
3602         ma->ncg[i] = 0;
3603         ma->nat[i] = 0;
3604     }
3605
3606     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3607
3608     cgindex = cgs->index;
3609
3610     /* Compute the center of geometry for all charge groups */
3611     for(icg=0; icg<cgs->nr; icg++)
3612     {
3613         k0      = cgindex[icg];
3614         k1      = cgindex[icg+1];
3615         nrcg    = k1 - k0;
3616         if (nrcg == 1)
3617         {
3618             copy_rvec(pos[k0],cg_cm);
3619         }
3620         else
3621         {
3622             inv_ncg = 1.0/nrcg;
3623
3624             clear_rvec(cg_cm);
3625             for(k=k0; (k<k1); k++)
3626             {
3627                 rvec_inc(cg_cm,pos[k]);
3628             }
3629             for(d=0; (d<DIM); d++)
3630             {
3631                 cg_cm[d] *= inv_ncg;
3632             }
3633         }
3634         /* Put the charge group in the box and determine the cell index */
3635         for(d=DIM-1; d>=0; d--) {
3636             pos_d = cg_cm[d];
3637             if (d < dd->npbcdim)
3638             {
3639                 bScrew = (dd->bScrewPBC && d == XX);
3640                 if (tric_dir[d] && dd->nc[d] > 1)
3641                 {
3642                     /* Use triclinic coordintates for this dimension */
3643                     for(j=d+1; j<DIM; j++)
3644                     {
3645                         pos_d += cg_cm[j]*tcm[j][d];
3646                     }
3647                 }
3648                 while(pos_d >= box[d][d])
3649                 {
3650                     pos_d -= box[d][d];
3651                     rvec_dec(cg_cm,box[d]);
3652                     if (bScrew)
3653                     {
3654                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3655                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3656                     }
3657                     for(k=k0; (k<k1); k++)
3658                     {
3659                         rvec_dec(pos[k],box[d]);
3660                         if (bScrew)
3661                         {
3662                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3663                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3664                         }
3665                     }
3666                 }
3667                 while(pos_d < 0)
3668                 {
3669                     pos_d += box[d][d];
3670                     rvec_inc(cg_cm,box[d]);
3671                     if (bScrew)
3672                     {
3673                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3674                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3675                     }
3676                     for(k=k0; (k<k1); k++)
3677                     {
3678                         rvec_inc(pos[k],box[d]);
3679                         if (bScrew) {
3680                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3681                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3682                         }
3683                     }
3684                 }
3685             }
3686             /* This could be done more efficiently */
3687             ind[d] = 0;
3688             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3689             {
3690                 ind[d]++;
3691             }
3692         }
3693         i = dd_index(dd->nc,ind);
3694         if (ma->ncg[i] == tmp_nalloc[i])
3695         {
3696             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3697             srenew(tmp_ind[i],tmp_nalloc[i]);
3698         }
3699         tmp_ind[i][ma->ncg[i]] = icg;
3700         ma->ncg[i]++;
3701         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3702     }
3703
3704     k1 = 0;
3705     for(i=0; i<dd->nnodes; i++)
3706     {
3707         ma->index[i] = k1;
3708         for(k=0; k<ma->ncg[i]; k++)
3709         {
3710             ma->cg[k1++] = tmp_ind[i][k];
3711         }
3712     }
3713     ma->index[dd->nnodes] = k1;
3714
3715     for(i=0; i<dd->nnodes; i++)
3716     {
3717         sfree(tmp_ind[i]);
3718     }
3719     sfree(tmp_ind);
3720     sfree(tmp_nalloc);
3721
3722     if (fplog)
3723     {
3724         char buf[22];
3725         fprintf(fplog,"Charge group distribution at step %s:",
3726                 gmx_step_str(step,buf));
3727         for(i=0; i<dd->nnodes; i++)
3728         {
3729             fprintf(fplog," %d",ma->ncg[i]);
3730         }
3731         fprintf(fplog,"\n");
3732     }
3733 }
3734
3735 static void get_cg_distribution(FILE *fplog,gmx_step_t step,gmx_domdec_t *dd,
3736                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3737                                 rvec pos[])
3738 {
3739     gmx_domdec_master_t *ma=NULL;
3740     ivec npulse;
3741     int  i,cg_gl;
3742     int  *ibuf,buf2[2] = { 0, 0 };
3743
3744     if (DDMASTER(dd))
3745     {
3746         ma = dd->ma;
3747
3748         if (dd->bScrewPBC)
3749         {
3750             check_screw_box(box);
3751         }
3752
3753         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3754
3755         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3756         for(i=0; i<dd->nnodes; i++)
3757         {
3758             ma->ibuf[2*i]   = ma->ncg[i];
3759             ma->ibuf[2*i+1] = ma->nat[i];
3760         }
3761         ibuf = ma->ibuf;
3762     }
3763     else
3764     {
3765         ibuf = NULL;
3766     }
3767     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3768
3769     dd->ncg_home = buf2[0];
3770     dd->nat_home = buf2[1];
3771     dd->ncg_tot  = dd->ncg_home;
3772     dd->nat_tot  = dd->nat_home;
3773     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3774     {
3775         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3776         srenew(dd->index_gl,dd->cg_nalloc);
3777         srenew(dd->cgindex,dd->cg_nalloc+1);
3778     }
3779     if (DDMASTER(dd))
3780     {
3781         for(i=0; i<dd->nnodes; i++)
3782         {
3783             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3784             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3785         }
3786     }
3787
3788     dd_scatterv(dd,
3789                 DDMASTER(dd) ? ma->ibuf : NULL,
3790                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3791                 DDMASTER(dd) ? ma->cg : NULL,
3792                 dd->ncg_home*sizeof(int),dd->index_gl);
3793
3794     /* Determine the home charge group sizes */
3795     dd->cgindex[0] = 0;
3796     for(i=0; i<dd->ncg_home; i++)
3797     {
3798         cg_gl = dd->index_gl[i];
3799         dd->cgindex[i+1] =
3800             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3801     }
3802
3803     if (debug)
3804     {
3805         fprintf(debug,"Home charge groups:\n");
3806         for(i=0; i<dd->ncg_home; i++)
3807         {
3808             fprintf(debug," %d",dd->index_gl[i]);
3809             if (i % 10 == 9)
3810                 fprintf(debug,"\n");
3811         }
3812         fprintf(debug,"\n");
3813     }
3814 }
3815
3816 static int compact_and_copy_vec_at(int ncg,int *move,
3817                                    int *cgindex,
3818                                    int nvec,int vec,
3819                                    rvec *src,gmx_domdec_comm_t *comm,
3820                                    bool bCompact)
3821 {
3822     int m,icg,i,i0,i1,nrcg;
3823     int home_pos;
3824     int pos_vec[DIM*2];
3825
3826     home_pos = 0;
3827
3828     for(m=0; m<DIM*2; m++)
3829     {
3830         pos_vec[m] = 0;
3831     }
3832
3833     i0 = 0;
3834     for(icg=0; icg<ncg; icg++)
3835     {
3836         i1 = cgindex[icg+1];
3837         m = move[icg];
3838         if (m == -1)
3839         {
3840             if (bCompact)
3841             {
3842                 /* Compact the home array in place */
3843                 for(i=i0; i<i1; i++)
3844                 {
3845                     copy_rvec(src[i],src[home_pos++]);
3846                 }
3847             }
3848         }
3849         else
3850         {
3851             /* Copy to the communication buffer */
3852             nrcg = i1 - i0;
3853             pos_vec[m] += 1 + vec*nrcg;
3854             for(i=i0; i<i1; i++)
3855             {
3856                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3857             }
3858             pos_vec[m] += (nvec - vec - 1)*nrcg;
3859         }
3860         if (!bCompact)
3861         {
3862             home_pos += i1 - i0;
3863         }
3864         i0 = i1;
3865     }
3866
3867     return home_pos;
3868 }
3869
3870 static int compact_and_copy_vec_cg(int ncg,int *move,
3871                                    int *cgindex,
3872                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3873                                    bool bCompact)
3874 {
3875     int m,icg,i0,i1,nrcg;
3876     int home_pos;
3877     int pos_vec[DIM*2];
3878
3879     home_pos = 0;
3880
3881     for(m=0; m<DIM*2; m++)
3882     {
3883         pos_vec[m] = 0;
3884     }
3885
3886     i0 = 0;
3887     for(icg=0; icg<ncg; icg++)
3888     {
3889         i1 = cgindex[icg+1];
3890         m = move[icg];
3891         if (m == -1)
3892         {
3893             if (bCompact)
3894             {
3895                 /* Compact the home array in place */
3896                 copy_rvec(src[icg],src[home_pos++]);
3897             }
3898         }
3899         else
3900         {
3901             nrcg = i1 - i0;
3902             /* Copy to the communication buffer */
3903             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3904             pos_vec[m] += 1 + nrcg*nvec;
3905         }
3906         i0 = i1;
3907     }
3908     if (!bCompact)
3909     {
3910         home_pos = ncg;
3911     }
3912
3913     return home_pos;
3914 }
3915
3916 static int compact_ind(int ncg,int *move,
3917                        int *index_gl,int *cgindex,
3918                        int *gatindex,
3919                        gmx_ga2la_t ga2la,char *bLocalCG,
3920                        int *cginfo)
3921 {
3922     int cg,nat,a0,a1,a,a_gl;
3923     int home_pos;
3924
3925     home_pos = 0;
3926     nat = 0;
3927     for(cg=0; cg<ncg; cg++)
3928     {
3929         a0 = cgindex[cg];
3930         a1 = cgindex[cg+1];
3931         if (move[cg] == -1)
3932         {
3933             /* Compact the home arrays in place.
3934              * Anything that can be done here avoids access to global arrays.
3935              */
3936             cgindex[home_pos] = nat;
3937             for(a=a0; a<a1; a++)
3938             {
3939                 a_gl = gatindex[a];
3940                 gatindex[nat] = a_gl;
3941                 /* The cell number stays 0, so we don't need to set it */
3942                 ga2la_change_la(ga2la,a_gl,nat);
3943                 nat++;
3944             }
3945             index_gl[home_pos] = index_gl[cg];
3946             cginfo[home_pos]   = cginfo[cg];
3947             /* The charge group remains local, so bLocalCG does not change */
3948             home_pos++;
3949         }
3950         else
3951         {
3952             /* Clear the global indices */
3953             for(a=a0; a<a1; a++)
3954             {
3955                 ga2la_del(ga2la,gatindex[a]);
3956             }
3957             if (bLocalCG)
3958             {
3959                 bLocalCG[index_gl[cg]] = FALSE;
3960             }
3961         }
3962     }
3963     cgindex[home_pos] = nat;
3964
3965     return home_pos;
3966 }
3967
3968 static void clear_and_mark_ind(int ncg,int *move,
3969                                int *index_gl,int *cgindex,int *gatindex,
3970                                gmx_ga2la_t ga2la,char *bLocalCG,
3971                                int *cell_index)
3972 {
3973     int cg,a0,a1,a;
3974
3975     for(cg=0; cg<ncg; cg++)
3976     {
3977         if (move[cg] >= 0)
3978         {
3979             a0 = cgindex[cg];
3980             a1 = cgindex[cg+1];
3981             /* Clear the global indices */
3982             for(a=a0; a<a1; a++)
3983             {
3984                 ga2la_del(ga2la,gatindex[a]);
3985             }
3986             if (bLocalCG)
3987             {
3988                 bLocalCG[index_gl[cg]] = FALSE;
3989             }
3990             /* Signal that this cg has moved using the ns cell index.
3991              * Here we set it to -1.
3992              * fill_grid will change it from -1 to 4*grid->ncells.
3993              */
3994             cell_index[cg] = -1;
3995         }
3996     }
3997 }
3998
3999 static void print_cg_move(FILE *fplog,
4000                           gmx_domdec_t *dd,
4001                           gmx_step_t step,int cg,int dim,int dir,
4002                           real limitd,
4003                           rvec cm_old,rvec cm_new,real pos_d)
4004 {
4005     gmx_domdec_comm_t *comm;
4006     char buf[22];
4007
4008     comm = dd->comm;
4009
4010     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4011     fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4012             ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4013     fprintf(fplog,"distance out of cell %f\n",
4014             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4015     fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4016             cm_old[XX],cm_old[YY],cm_old[ZZ]);
4017     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4018             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4019     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4020             dim2char(dim),
4021             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4022     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4023             dim2char(dim),
4024             comm->cell_x0[dim],comm->cell_x1[dim]);
4025 }
4026
4027 static void cg_move_error(FILE *fplog,
4028                           gmx_domdec_t *dd,
4029                           gmx_step_t step,int cg,int dim,int dir,
4030                           real limitd,
4031                           rvec cm_old,rvec cm_new,real pos_d)
4032 {
4033     if (fplog)
4034     {
4035         print_cg_move(fplog, dd,step,cg,dim,dir,limitd,cm_old,cm_new,pos_d);
4036     }
4037     print_cg_move(stderr,dd,step,cg,dim,dir,limitd,cm_old,cm_new,pos_d);
4038     gmx_fatal(FARGS,
4039               "A charge group moved too far between two domain decomposition steps\n"
4040               "This usually means that your system is not well equilibrated");
4041 }
4042
4043 static void rotate_state_atom(t_state *state,int a)
4044 {
4045     int est;
4046
4047     for(est=estX; est<estNR; est++)
4048     {
4049         if (state->flags & (1<<est)) {
4050             switch (est) {
4051             case estX:
4052                 /* Rotate the complete state; for a rectangular box only */
4053                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4054                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4055                 break;
4056             case estV:
4057                 state->v[a][YY] = -state->v[a][YY];
4058                 state->v[a][ZZ] = -state->v[a][ZZ];
4059                 break;
4060             case estSDX:
4061                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4062                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4063                 break;
4064             case estCGP:
4065                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4066                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4067                 break;
4068             case estDISRE_INITF:
4069             case estDISRE_RM3TAV:
4070             case estORIRE_INITF:
4071             case estORIRE_DTAV:
4072                 /* These are distances, so not affected by rotation */
4073                 break;
4074             default:
4075                 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4076             }
4077         }
4078     }
4079 }
4080
4081 static int dd_redistribute_cg(FILE *fplog,gmx_step_t step,
4082                               gmx_domdec_t *dd,ivec tric_dir,
4083                               t_state *state,rvec **f,
4084                               t_forcerec *fr,t_mdatoms *md,
4085                               bool bCompact,
4086                               t_nrnb *nrnb)
4087 {
4088     int  *move;
4089     int  npbcdim;
4090     int  ncg[DIM*2],nat[DIM*2];
4091     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4092     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4093     int  sbuf[2],rbuf[2];
4094     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4095     int  flag;
4096     bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4097     bool bScrew;
4098     ivec dev;
4099     real inv_ncg,pos_d;
4100     matrix tcm;
4101     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4102     atom_id *cgindex;
4103     cginfo_mb_t *cginfo_mb;
4104     gmx_domdec_comm_t *comm;
4105
4106     if (dd->bScrewPBC)
4107     {
4108         check_screw_box(state->box);
4109     }
4110
4111     comm  = dd->comm;
4112     cg_cm = fr->cg_cm;
4113
4114     for(i=estX; i<estNR; i++)
4115     {
4116         switch (i)
4117         {
4118         case estX:   /* Always present */            break;
4119         case estV:   bV   = (state->flags & (1<<i)); break;
4120         case estSDX: bSDX = (state->flags & (1<<i)); break;
4121         case estCGP: bCGP = (state->flags & (1<<i)); break;
4122         case estLD_RNG:
4123         case estLD_RNGI:
4124         case estDISRE_INITF:
4125         case estDISRE_RM3TAV:
4126         case estORIRE_INITF:
4127         case estORIRE_DTAV:
4128             /* No processing required */
4129             break;
4130         default:
4131             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4132         }
4133     }
4134
4135     if (dd->ncg_tot > comm->nalloc_int)
4136     {
4137         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4138         srenew(comm->buf_int,comm->nalloc_int);
4139     }
4140     move = comm->buf_int;
4141
4142     /* Clear the count */
4143     for(c=0; c<dd->ndim*2; c++)
4144     {
4145         ncg[c] = 0;
4146         nat[c] = 0;
4147     }
4148
4149     npbcdim = dd->npbcdim;
4150
4151     for(d=0; (d<DIM); d++)
4152     {
4153         limitd[d] = dd->comm->cellsize_min[d];
4154         if (d >= npbcdim && dd->ci[d] == 0)
4155         {
4156             cell_x0[d] = -GMX_FLOAT_MAX;
4157         }
4158         else
4159         {
4160             cell_x0[d] = comm->cell_x0[d];
4161         }
4162         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4163         {
4164             cell_x1[d] = GMX_FLOAT_MAX;
4165         }
4166         else
4167         {
4168             cell_x1[d] = comm->cell_x1[d];
4169         }
4170         limit0[d] = comm->old_cell_x0[d] - limitd[d];
4171         limit1[d] = comm->old_cell_x1[d] + limitd[d];
4172     }
4173
4174     make_tric_corr_matrix(npbcdim,state->box,tcm);
4175
4176     cgindex = dd->cgindex;
4177
4178     /* Compute the center of geometry for all home charge groups
4179      * and put them in the box and determine where they should go.
4180      */
4181     for(cg=0; cg<dd->ncg_home; cg++)
4182     {
4183         k0   = cgindex[cg];
4184         k1   = cgindex[cg+1];
4185         nrcg = k1 - k0;
4186         if (nrcg == 1)
4187         {
4188             copy_rvec(state->x[k0],cm_new);
4189         }
4190         else
4191         {
4192             inv_ncg = 1.0/nrcg;
4193
4194             clear_rvec(cm_new);
4195             for(k=k0; (k<k1); k++)
4196             {
4197                 rvec_inc(cm_new,state->x[k]);
4198             }
4199             for(d=0; (d<DIM); d++)
4200             {
4201                 cm_new[d] = inv_ncg*cm_new[d];
4202             }
4203         }
4204
4205         clear_ivec(dev);
4206         /* Do pbc and check DD cell boundary crossings */
4207         for(d=DIM-1; d>=0; d--)
4208         {
4209             if (dd->nc[d] > 1)
4210             {
4211                 bScrew = (dd->bScrewPBC && d == XX);
4212                 /* Determine the location of this cg in lattice coordinates */
4213                 pos_d = cm_new[d];
4214                 if (tric_dir[d])
4215                 {
4216                     for(d2=d+1; d2<DIM; d2++)
4217                     {
4218                         pos_d += cm_new[d2]*tcm[d2][d];
4219                     }
4220                 }
4221                 /* Put the charge group in the triclinic unit-cell */
4222                 if (pos_d >= cell_x1[d])
4223                 {
4224                     if (pos_d >= limit1[d])
4225                     {
4226                         cg_move_error(fplog,dd,step,cg,d,1,limitd[d],
4227                                       cg_cm[cg],cm_new,pos_d);
4228                     }
4229                     dev[d] = 1;
4230                     if (dd->ci[d] == dd->nc[d] - 1)
4231                     {
4232                         rvec_dec(cm_new,state->box[d]);
4233                         if (bScrew)
4234                         {
4235                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4236                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4237                         }
4238                         for(k=k0; (k<k1); k++)
4239                         {
4240                             rvec_dec(state->x[k],state->box[d]);
4241                             if (bScrew)
4242                             {
4243                                 rotate_state_atom(state,k);
4244                             }
4245                         }
4246                     }
4247                 }
4248                 else if (pos_d < cell_x0[d])
4249                 {
4250                     if (pos_d < limit0[d])
4251                     {
4252                         cg_move_error(fplog,dd,step,cg,d,-1,limitd[d],
4253                                       cg_cm[cg],cm_new,pos_d);
4254                     }
4255                     dev[d] = -1;
4256                     if (dd->ci[d] == 0)
4257                     {
4258                         rvec_inc(cm_new,state->box[d]);
4259                         if (bScrew)
4260                         {
4261                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4262                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4263                         }
4264                         for(k=k0; (k<k1); k++)
4265                         {
4266                             rvec_inc(state->x[k],state->box[d]);
4267                             if (bScrew)
4268                             {
4269                                 rotate_state_atom(state,k);
4270                             }
4271                         }
4272                     }
4273                 }
4274             }
4275             else if (d < npbcdim)
4276             {
4277                 /* Put the charge group in the rectangular unit-cell */
4278                 while (cm_new[d] >= state->box[d][d])
4279                 {
4280                     rvec_dec(cm_new,state->box[d]);
4281                     for(k=k0; (k<k1); k++)
4282                     {
4283                         rvec_dec(state->x[k],state->box[d]);
4284                     }
4285                 }
4286                 while (cm_new[d] < 0)
4287                 {
4288                     rvec_inc(cm_new,state->box[d]);
4289                     for(k=k0; (k<k1); k++)
4290                     {
4291                         rvec_inc(state->x[k],state->box[d]);
4292                     }
4293                 }
4294             }
4295         }
4296
4297         copy_rvec(cm_new,cg_cm[cg]);
4298
4299         /* Determine where this cg should go */
4300         flag = 0;
4301         mc = -1;
4302         for(d=0; d<dd->ndim; d++)
4303         {
4304             dim = dd->dim[d];
4305             if (dev[dim] == 1)
4306             {
4307                 flag |= DD_FLAG_FW(d);
4308                 if (mc == -1)
4309                 {
4310                     mc = d*2;
4311                 }
4312             }
4313             else if (dev[dim] == -1)
4314             {
4315                 flag |= DD_FLAG_BW(d);
4316                 if (mc == -1) {
4317                     if (dd->nc[dim] > 2)
4318                     {
4319                         mc = d*2 + 1;
4320                     }
4321                     else
4322                     {
4323                         mc = d*2;
4324                     }
4325                 }
4326             }
4327         }
4328         move[cg] = mc;
4329         if (mc >= 0)
4330         {
4331             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4332             {
4333                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4334                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4335             }
4336             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4337             /* We store the cg size in the lower 16 bits
4338              * and the place where the charge group should go
4339              * in the next 6 bits. This saves some communication volume.
4340              */
4341             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4342             ncg[mc] += 1;
4343             nat[mc] += nrcg;
4344         }
4345     }
4346
4347     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4348     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4349
4350     nvec = 1;
4351     if (bV)
4352     {
4353         nvec++;
4354     }
4355     if (bSDX)
4356     {
4357         nvec++;
4358     }
4359     if (bCGP)
4360     {
4361         nvec++;
4362     }
4363
4364     /* Make sure the communication buffers are large enough */
4365     for(mc=0; mc<dd->ndim*2; mc++)
4366     {
4367         nvr = ncg[mc] + nat[mc]*nvec;
4368         if (nvr > comm->cgcm_state_nalloc[mc])
4369         {
4370             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4371             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4372         }
4373     }
4374
4375     /* Recalculating cg_cm might be cheaper than communicating,
4376      * but that could give rise to rounding issues.
4377      */
4378     home_pos_cg =
4379         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4380                                 nvec,cg_cm,comm,bCompact);
4381
4382     vec = 0;
4383     home_pos_at =
4384         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4385                                 nvec,vec++,state->x,comm,bCompact);
4386     if (bV)
4387     {
4388         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4389                                 nvec,vec++,state->v,comm,bCompact);
4390     }
4391     if (bSDX)
4392     {
4393         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4394                                 nvec,vec++,state->sd_X,comm,bCompact);
4395     }
4396     if (bCGP)
4397     {
4398         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4399                                 nvec,vec++,state->cg_p,comm,bCompact);
4400     }
4401
4402     if (bCompact)
4403     {
4404         compact_ind(dd->ncg_home,move,
4405                     dd->index_gl,dd->cgindex,dd->gatindex,
4406                     dd->ga2la,comm->bLocalCG,
4407                     fr->cginfo);
4408     }
4409     else
4410     {
4411         clear_and_mark_ind(dd->ncg_home,move,
4412                            dd->index_gl,dd->cgindex,dd->gatindex,
4413                            dd->ga2la,comm->bLocalCG,
4414                            fr->ns.grid->cell_index);
4415     }
4416
4417     cginfo_mb = fr->cginfo_mb;
4418
4419     ncg_stay_home = home_pos_cg;
4420     for(d=0; d<dd->ndim; d++)
4421     {
4422         dim = dd->dim[d];
4423         ncg_recv = 0;
4424         nat_recv = 0;
4425         nvr      = 0;
4426         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4427         {
4428             cdd = d*2 + dir;
4429             /* Communicate the cg and atom counts */
4430             sbuf[0] = ncg[cdd];
4431             sbuf[1] = nat[cdd];
4432             if (debug)
4433             {
4434                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4435                         d,dir,sbuf[0],sbuf[1]);
4436             }
4437             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4438
4439             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4440             {
4441                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4442                 srenew(comm->buf_int,comm->nalloc_int);
4443             }
4444
4445             /* Communicate the charge group indices, sizes and flags */
4446             dd_sendrecv_int(dd, d, dir,
4447                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4448                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4449
4450             nvs = ncg[cdd] + nat[cdd]*nvec;
4451             i   = rbuf[0]  + rbuf[1] *nvec;
4452             check_vec_rvec_alloc(&comm->vbuf,nvr+i);
4453
4454             /* Communicate cgcm and state */
4455             dd_sendrecv_rvec(dd, d, dir,
4456                              comm->cgcm_state[cdd], nvs,
4457                              comm->vbuf.v+nvr, i);
4458             ncg_recv += rbuf[0];
4459             nat_recv += rbuf[1];
4460             nvr      += i;
4461         }
4462
4463         /* Process the received charge groups */
4464         buf_pos = 0;
4465         for(cg=0; cg<ncg_recv; cg++)
4466         {
4467             flag = comm->buf_int[cg*DD_CGIBS+1];
4468             mc = -1;
4469             if (d < dd->ndim-1)
4470             {
4471                 /* Check which direction this cg should go */
4472                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4473                 {
4474                     if (dd->bGridJump)
4475                     {
4476                         /* The cell boundaries for dimension d2 are not equal
4477                          * for each cell row of the lower dimension(s),
4478                          * therefore we might need to redetermine where
4479                          * this cg should go.
4480                          */
4481                         dim2 = dd->dim[d2];
4482                         /* If this cg crosses the box boundary in dimension d2
4483                          * we can use the communicated flag, so we do not
4484                          * have to worry about pbc.
4485                          */
4486                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4487                                (flag & DD_FLAG_FW(d2))) ||
4488                               (dd->ci[dim2] == 0 &&
4489                                (flag & DD_FLAG_BW(d2)))))
4490                         {
4491                             /* Clear the two flags for this dimension */
4492                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4493                             /* Determine the location of this cg
4494                              * in lattice coordinates
4495                              */
4496                             pos_d = comm->vbuf.v[buf_pos][dim2];
4497                             if (tric_dir[dim2])
4498                             {
4499                                 for(d3=dim2+1; d3<DIM; d3++)
4500                                 {
4501                                     pos_d +=
4502                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4503                                 }
4504                             }
4505                             /* Check of we are not at the box edge.
4506                              * pbc is only handled in the first step above,
4507                              * but this check could move over pbc while
4508                              * the first step did not due to different rounding.
4509                              */
4510                             if (pos_d >= cell_x1[dim2] &&
4511                                 dd->ci[dim2] != dd->nc[dim2]-1)
4512                             {
4513                                 flag |= DD_FLAG_FW(d2);
4514                             }
4515                             else if (pos_d < cell_x0[dim2] &&
4516                                      dd->ci[dim2] != 0)
4517                             {
4518                                 flag |= DD_FLAG_BW(d2);
4519                             }
4520                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4521                         }
4522                     }
4523                     /* Set to which neighboring cell this cg should go */
4524                     if (flag & DD_FLAG_FW(d2))
4525                     {
4526                         mc = d2*2;
4527                     }
4528                     else if (flag & DD_FLAG_BW(d2))
4529                     {
4530                         if (dd->nc[dd->dim[d2]] > 2)
4531                         {
4532                             mc = d2*2+1;
4533                         }
4534                         else
4535                         {
4536                             mc = d2*2;
4537                         }
4538                     }
4539                 }
4540             }
4541
4542             nrcg = flag & DD_FLAG_NRCG;
4543             if (mc == -1)
4544             {
4545                 if (home_pos_cg+1 > dd->cg_nalloc)
4546                 {
4547                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4548                     srenew(dd->index_gl,dd->cg_nalloc);
4549                     srenew(dd->cgindex,dd->cg_nalloc+1);
4550                 }
4551                 /* Set the global charge group index and size */
4552                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4553                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4554                 /* Copy the state from the buffer */
4555                 if (home_pos_cg >= fr->cg_nalloc)
4556                 {
4557                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4558                     cg_cm = fr->cg_cm;
4559                 }
4560                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4561                 /* Set the cginfo */
4562                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4563                                                    dd->index_gl[home_pos_cg]);
4564                 if (comm->bLocalCG)
4565                 {
4566                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4567                 }
4568
4569                 if (home_pos_at+nrcg > state->nalloc)
4570                 {
4571                     dd_realloc_state(state,f,home_pos_at+nrcg);
4572                 }
4573                 for(i=0; i<nrcg; i++)
4574                 {
4575                     copy_rvec(comm->vbuf.v[buf_pos++],
4576                               state->x[home_pos_at+i]);
4577                 }
4578                 if (bV)
4579                 {
4580                     for(i=0; i<nrcg; i++)
4581                     {
4582                         copy_rvec(comm->vbuf.v[buf_pos++],
4583                                   state->v[home_pos_at+i]);
4584                     }
4585                 }
4586                 if (bSDX)
4587                 {
4588                     for(i=0; i<nrcg; i++)
4589                     {
4590                         copy_rvec(comm->vbuf.v[buf_pos++],
4591                                   state->sd_X[home_pos_at+i]);
4592                     }
4593                 }
4594                 if (bCGP)
4595                 {
4596                     for(i=0; i<nrcg; i++)
4597                     {
4598                         copy_rvec(comm->vbuf.v[buf_pos++],
4599                                   state->cg_p[home_pos_at+i]);
4600                     }
4601                 }
4602                 home_pos_cg += 1;
4603                 home_pos_at += nrcg;
4604             }
4605             else
4606             {
4607                 /* Reallocate the buffers if necessary  */
4608                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4609                 {
4610                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4611                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4612                 }
4613                 nvr = ncg[mc] + nat[mc]*nvec;
4614                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4615                 {
4616                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4617                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4618                 }
4619                 /* Copy from the receive to the send buffers */
4620                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4621                        comm->buf_int + cg*DD_CGIBS,
4622                        DD_CGIBS*sizeof(int));
4623                 memcpy(comm->cgcm_state[mc][nvr],
4624                        comm->vbuf.v[buf_pos],
4625                        (1+nrcg*nvec)*sizeof(rvec));
4626                 buf_pos += 1 + nrcg*nvec;
4627                 ncg[mc] += 1;
4628                 nat[mc] += nrcg;
4629             }
4630         }
4631     }
4632
4633     /* With sorting (!bCompact) the indices are now only partially up to date
4634      * and ncg_home and nat_home are not the real count, since there are
4635      * "holes" in the arrays for the charge groups that moved to neighbors.
4636      */
4637     dd->ncg_home = home_pos_cg;
4638     dd->nat_home = home_pos_at;
4639
4640     if (debug)
4641     {
4642         fprintf(debug,"Finished repartitioning\n");
4643     }
4644
4645     return ncg_stay_home;
4646 }
4647
4648 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4649 {
4650     dd->comm->cycl[ddCycl] += cycles;
4651     dd->comm->cycl_n[ddCycl]++;
4652     if (cycles > dd->comm->cycl_max[ddCycl])
4653     {
4654         dd->comm->cycl_max[ddCycl] = cycles;
4655     }
4656 }
4657
4658 static double force_flop_count(t_nrnb *nrnb)
4659 {
4660     int i;
4661     double sum;
4662     const char *name;
4663
4664     sum = 0;
4665     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4666     {
4667         /* To get closer to the real timings, we half the count
4668          * for the normal loops and again half it for water loops.
4669          */
4670         name = nrnb_str(i);
4671         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4672         {
4673             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4674         }
4675         else
4676         {
4677             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4678         }
4679     }
4680     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4681     {
4682         name = nrnb_str(i);
4683         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4684         sum += nrnb->n[i]*cost_nrnb(i);
4685     }
4686     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4687     {
4688         sum += nrnb->n[i]*cost_nrnb(i);
4689     }
4690
4691     return sum;
4692 }
4693
4694 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4695 {
4696     if (dd->comm->eFlop)
4697     {
4698         dd->comm->flop -= force_flop_count(nrnb);
4699     }
4700 }
4701 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4702 {
4703     if (dd->comm->eFlop)
4704     {
4705         dd->comm->flop += force_flop_count(nrnb);
4706         dd->comm->flop_n++;
4707     }
4708 }
4709
4710 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4711 {
4712     int i;
4713
4714     for(i=0; i<ddCyclNr; i++)
4715     {
4716         dd->comm->cycl[i] = 0;
4717         dd->comm->cycl_n[i] = 0;
4718         dd->comm->cycl_max[i] = 0;
4719     }
4720     dd->comm->flop = 0;
4721     dd->comm->flop_n = 0;
4722 }
4723
4724 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4725 {
4726     gmx_domdec_comm_t *comm;
4727     gmx_domdec_load_t *load;
4728     gmx_domdec_root_t *root=NULL;
4729     int  d,dim,cid,i,pos;
4730     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4731     bool bSepPME;
4732
4733     if (debug)
4734     {
4735         fprintf(debug,"get_load_distribution start\n");
4736     }
4737
4738     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4739
4740     comm = dd->comm;
4741
4742     bSepPME = (dd->pme_nodeid >= 0);
4743
4744     for(d=dd->ndim-1; d>=0; d--)
4745     {
4746         dim = dd->dim[d];
4747         /* Check if we participate in the communication in this dimension */
4748         if (d == dd->ndim-1 ||
4749             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4750         {
4751             load = &comm->load[d];
4752             if (dd->bGridJump)
4753             {
4754                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4755             }
4756             pos = 0;
4757             if (d == dd->ndim-1)
4758             {
4759                 sbuf[pos++] = dd_force_load(comm);
4760                 sbuf[pos++] = sbuf[0];
4761                 if (dd->bGridJump)
4762                 {
4763                     sbuf[pos++] = sbuf[0];
4764                     sbuf[pos++] = cell_frac;
4765                     if (d > 0)
4766                     {
4767                         sbuf[pos++] = comm->cell_f_max0[d];
4768                         sbuf[pos++] = comm->cell_f_min1[d];
4769                     }
4770                 }
4771                 if (bSepPME)
4772                 {
4773                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4774                     sbuf[pos++] = comm->cycl[ddCyclPME];
4775                 }
4776             }
4777             else
4778             {
4779                 sbuf[pos++] = comm->load[d+1].sum;
4780                 sbuf[pos++] = comm->load[d+1].max;
4781                 if (dd->bGridJump)
4782                 {
4783                     sbuf[pos++] = comm->load[d+1].sum_m;
4784                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4785                     sbuf[pos++] = comm->load[d+1].flags;
4786                     if (d > 0)
4787                     {
4788                         sbuf[pos++] = comm->cell_f_max0[d];
4789                         sbuf[pos++] = comm->cell_f_min1[d];
4790                     }
4791                 }
4792                 if (bSepPME)
4793                 {
4794                     sbuf[pos++] = comm->load[d+1].mdf;
4795                     sbuf[pos++] = comm->load[d+1].pme;
4796                 }
4797             }
4798             load->nload = pos;
4799             /* Communicate a row in DD direction d.
4800              * The communicators are setup such that the root always has rank 0.
4801              */
4802 #ifdef GMX_MPI
4803             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4804                        load->load,load->nload*sizeof(float),MPI_BYTE,
4805                        0,comm->mpi_comm_load[d]);
4806 #endif
4807             if (dd->ci[dim] == dd->master_ci[dim])
4808             {
4809                 /* We are the root, process this row */
4810                 if (comm->bDynLoadBal)
4811                 {
4812                     root = comm->root[d];
4813                 }
4814                 load->sum = 0;
4815                 load->max = 0;
4816                 load->sum_m = 0;
4817                 load->cvol_min = 1;
4818                 load->flags = 0;
4819                 load->mdf = 0;
4820                 load->pme = 0;
4821                 pos = 0;
4822                 for(i=0; i<dd->nc[dim]; i++)
4823                 {
4824                     load->sum += load->load[pos++];
4825                     load->max = max(load->max,load->load[pos]);
4826                     pos++;
4827                     if (dd->bGridJump)
4828                     {
4829                         if (root->bLimited)
4830                         {
4831                             /* This direction could not be load balanced properly,
4832                              * therefore we need to use the maximum iso the average load.
4833                              */
4834                             load->sum_m = max(load->sum_m,load->load[pos]);
4835                         }
4836                         else
4837                         {
4838                             load->sum_m += load->load[pos];
4839                         }
4840                         pos++;
4841                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4842                         pos++;
4843                         if (d < dd->ndim-1)
4844                         {
4845                             load->flags = (int)(load->load[pos++] + 0.5);
4846                         }
4847                         if (d > 0)
4848                         {
4849                             root->cell_f_max0[i] = load->load[pos++];
4850                             root->cell_f_min1[i] = load->load[pos++];
4851                         }
4852                     }
4853                     if (bSepPME)
4854                     {
4855                         load->mdf = max(load->mdf,load->load[pos]);
4856                         pos++;
4857                         load->pme = max(load->pme,load->load[pos]);
4858                         pos++;
4859                     }
4860                 }
4861                 if (comm->bDynLoadBal && root->bLimited)
4862                 {
4863                     load->sum_m *= dd->nc[dim];
4864                     load->flags |= (1<<d);
4865                 }
4866             }
4867         }
4868     }
4869
4870     if (DDMASTER(dd))
4871     {
4872         comm->nload      += dd_load_count(comm);
4873         comm->load_step  += comm->cycl[ddCyclStep];
4874         comm->load_sum   += comm->load[0].sum;
4875         comm->load_max   += comm->load[0].max;
4876         if (comm->bDynLoadBal)
4877         {
4878             for(d=0; d<dd->ndim; d++)
4879             {
4880                 if (comm->load[0].flags & (1<<d))
4881                 {
4882                     comm->load_lim[d]++;
4883                 }
4884             }
4885         }
4886         if (bSepPME)
4887         {
4888             comm->load_mdf += comm->load[0].mdf;
4889             comm->load_pme += comm->load[0].pme;
4890         }
4891     }
4892
4893     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
4894
4895     if (debug)
4896     {
4897         fprintf(debug,"get_load_distribution finished\n");
4898     }
4899 }
4900
4901 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
4902 {
4903     /* Return the relative performance loss on the total run time
4904      * due to the force calculation load imbalance.
4905      */
4906     if (dd->comm->nload > 0)
4907     {
4908         return
4909             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
4910             (dd->comm->load_step*dd->nnodes);
4911     }
4912     else
4913     {
4914         return 0;
4915     }
4916 }
4917
4918 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
4919 {
4920     char  buf[STRLEN];
4921     int   npp,npme,nnodes,d,limp;
4922     float imbal,pme_f_ratio,lossf,lossp=0;
4923     bool  bLim;
4924     gmx_domdec_comm_t *comm;
4925
4926     comm = dd->comm;
4927     if (DDMASTER(dd) && comm->nload > 0)
4928     {
4929         npp    = dd->nnodes;
4930         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
4931         nnodes = npp + npme;
4932         imbal = comm->load_max*npp/comm->load_sum - 1;
4933         lossf = dd_force_imb_perf_loss(dd);
4934         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
4935         fprintf(fplog,"%s",buf);
4936         fprintf(stderr,"\n");
4937         fprintf(stderr,"%s",buf);
4938         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
4939         fprintf(fplog,"%s",buf);
4940         fprintf(stderr,"%s",buf);
4941         bLim = FALSE;
4942         if (comm->bDynLoadBal)
4943         {
4944             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
4945             for(d=0; d<dd->ndim; d++)
4946             {
4947                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
4948                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
4949                 if (limp >= 50)
4950                 {
4951                     bLim = TRUE;
4952                 }
4953             }
4954             sprintf(buf+strlen(buf),"\n");
4955             fprintf(fplog,"%s",buf);
4956             fprintf(stderr,"%s",buf);
4957         }
4958         if (npme > 0)
4959         {
4960             pme_f_ratio = comm->load_pme/comm->load_mdf;
4961             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
4962             if (lossp <= 0)
4963             {
4964                 lossp *= (float)npme/(float)nnodes;
4965             }
4966             else
4967             {
4968                 lossp *= (float)npp/(float)nnodes;
4969             }
4970             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
4971             fprintf(fplog,"%s",buf);
4972             fprintf(stderr,"%s",buf);
4973             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
4974             fprintf(fplog,"%s",buf);
4975             fprintf(stderr,"%s",buf);
4976         }
4977         fprintf(fplog,"\n");
4978         fprintf(stderr,"\n");
4979
4980         if (lossf >= DD_PERF_LOSS)
4981         {
4982             sprintf(buf,
4983                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
4984                     "      in the domain decomposition.\n",lossf*100);
4985             if (!comm->bDynLoadBal)
4986             {
4987                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
4988             }
4989             else if (bLim)
4990             {
4991                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
4992             }
4993             fprintf(fplog,"%s\n",buf);
4994             fprintf(stderr,"%s\n",buf);
4995         }
4996         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
4997         {
4998             sprintf(buf,
4999                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5000                     "      had %s work to do than the PP nodes.\n"
5001                     "      You might want to %s the number of PME nodes\n"
5002                     "      or %s the cut-off and the grid spacing.\n",
5003                     fabs(lossp*100),
5004                     (lossp < 0) ? "less"     : "more",
5005                     (lossp < 0) ? "decrease" : "increase",
5006                     (lossp < 0) ? "decrease" : "increase");
5007             fprintf(fplog,"%s\n",buf);
5008             fprintf(stderr,"%s\n",buf);
5009         }
5010     }
5011 }
5012
5013 static float dd_vol_min(gmx_domdec_t *dd)
5014 {
5015     return dd->comm->load[0].cvol_min*dd->nnodes;
5016 }
5017
5018 static bool dd_load_flags(gmx_domdec_t *dd)
5019 {
5020     return dd->comm->load[0].flags;
5021 }
5022
5023 static float dd_f_imbal(gmx_domdec_t *dd)
5024 {
5025     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5026 }
5027
5028 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5029 {
5030     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5031 }
5032
5033 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_step_t step)
5034 {
5035     int flags,d;
5036     char buf[22];
5037
5038     flags = dd_load_flags(dd);
5039     if (flags)
5040     {
5041         fprintf(fplog,
5042                 "DD  load balancing is limited by minimum cell size in dimension");
5043         for(d=0; d<dd->ndim; d++)
5044         {
5045             if (flags & (1<<d))
5046             {
5047                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5048             }
5049         }
5050         fprintf(fplog,"\n");
5051     }
5052     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5053     if (dd->comm->bDynLoadBal)
5054     {
5055         fprintf(fplog,"  vol min/aver %5.3f%c",
5056                 dd_vol_min(dd),flags ? '!' : ' ');
5057     }
5058     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5059     if (dd->comm->cycl_n[ddCyclPME])
5060     {
5061         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5062     }
5063     fprintf(fplog,"\n\n");
5064 }
5065
5066 static void dd_print_load_verbose(gmx_domdec_t *dd)
5067 {
5068     if (dd->comm->bDynLoadBal)
5069     {
5070         fprintf(stderr,"vol %4.2f%c ",
5071                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5072     }
5073     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5074     if (dd->comm->cycl_n[ddCyclPME])
5075     {
5076         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5077     }
5078 }
5079
5080 #ifdef GMX_MPI
5081 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5082                                    int dim_ind,ivec loc)
5083 {
5084     MPI_Group g_row;
5085     MPI_Comm  c_row;
5086     int  dim,i,*rank;
5087     ivec loc_c;
5088     gmx_domdec_root_t *root;
5089
5090     dim = dd->dim[dim_ind];
5091     copy_ivec(loc,loc_c);
5092     snew(rank,dd->nc[dim]);
5093     for(i=0; i<dd->nc[dim]; i++)
5094     {
5095         loc_c[dim] = i;
5096         rank[i] = dd_index(dd->nc,loc_c);
5097     }
5098     /* Here we create a new group, that does not necessarily
5099      * include our process. But MPI_Comm_create needs to be
5100      * called by all the processes in the original communicator.
5101      * Calling MPI_Group_free afterwards gives errors, so I assume
5102      * also the group is needed by all processes. (B. Hess)
5103      */
5104     MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5105     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5106     if (c_row != MPI_COMM_NULL)
5107     {
5108         /* This process is part of the group */
5109         dd->comm->mpi_comm_load[dim_ind] = c_row;
5110         if (dd->comm->eDLB != edlbNO)
5111         {
5112             if (dd->ci[dim] == dd->master_ci[dim])
5113             {
5114                 /* This is the root process of this row */
5115                 snew(dd->comm->root[dim_ind],1);
5116                 root = dd->comm->root[dim_ind];
5117                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5118                 snew(root->old_cell_f,dd->nc[dim]+1);
5119                 snew(root->bCellMin,dd->nc[dim]);
5120                 if (dim_ind > 0)
5121                 {
5122                     snew(root->cell_f_max0,dd->nc[dim]);
5123                     snew(root->cell_f_min1,dd->nc[dim]);
5124                     snew(root->bound_min,dd->nc[dim]);
5125                     snew(root->bound_max,dd->nc[dim]);
5126                 }
5127                 snew(root->buf_ncd,dd->nc[dim]);
5128             }
5129             else
5130             {
5131                 /* This is not a root process, we only need to receive cell_f */
5132                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5133             }
5134         }
5135         if (dd->ci[dim] == dd->master_ci[dim])
5136         {
5137             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5138         }
5139     }
5140     sfree(rank);
5141 }
5142 #endif
5143
5144 static void make_load_communicators(gmx_domdec_t *dd)
5145 {
5146 #ifdef GMX_MPI
5147   MPI_Group g_all;
5148   int  dim0,dim1,i,j;
5149   ivec loc;
5150
5151   if (debug)
5152     fprintf(debug,"Making load communicators\n");
5153
5154   MPI_Comm_group(dd->mpi_comm_all,&g_all);
5155
5156   snew(dd->comm->load,dd->ndim);
5157   snew(dd->comm->mpi_comm_load,dd->ndim);
5158
5159   clear_ivec(loc);
5160   make_load_communicator(dd,g_all,0,loc);
5161   if (dd->ndim > 1) {
5162     dim0 = dd->dim[0];
5163     for(i=0; i<dd->nc[dim0]; i++) {
5164       loc[dim0] = i;
5165       make_load_communicator(dd,g_all,1,loc);
5166     }
5167   }
5168   if (dd->ndim > 2) {
5169     dim0 = dd->dim[0];
5170     for(i=0; i<dd->nc[dim0]; i++) {
5171       loc[dim0] = i;
5172       dim1 = dd->dim[1];
5173       for(j=0; j<dd->nc[dim1]; j++) {
5174           loc[dim1] = j;
5175           make_load_communicator(dd,g_all,2,loc);
5176       }
5177     }
5178   }
5179
5180   MPI_Group_free(&g_all);
5181
5182   if (debug)
5183     fprintf(debug,"Finished making load communicators\n");
5184 #endif
5185 }
5186
5187 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5188 {
5189     bool bZYX;
5190     int  d,dim,i,j,m;
5191     ivec tmp,s;
5192     int  nzone,nzonep;
5193     ivec dd_zp[DD_MAXIZONE];
5194     gmx_domdec_zones_t *zones;
5195     gmx_domdec_ns_ranges_t *izone;
5196
5197     for(d=0; d<dd->ndim; d++)
5198     {
5199         dim = dd->dim[d];
5200         copy_ivec(dd->ci,tmp);
5201         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5202         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5203         copy_ivec(dd->ci,tmp);
5204         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5205         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5206         if (debug)
5207         {
5208             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5209                     dd->rank,dim,
5210                     dd->neighbor[d][0],
5211                     dd->neighbor[d][1]);
5212         }
5213     }
5214
5215     if (DDMASTER(dd))
5216     {
5217         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5218             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5219     }
5220     if (fplog)
5221     {
5222         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5223                 dd->ndim,
5224                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5225                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5226     }
5227     switch (dd->ndim)
5228     {
5229     case 3:
5230         nzone  = dd_z3n;
5231         nzonep = dd_zp3n;
5232         for(i=0; i<nzonep; i++)
5233         {
5234             copy_ivec(dd_zp3[i],dd_zp[i]);
5235         }
5236         break;
5237     case 2:
5238         nzone  = dd_z2n;
5239         nzonep = dd_zp2n;
5240         for(i=0; i<nzonep; i++)
5241         {
5242             copy_ivec(dd_zp2[i],dd_zp[i]);
5243         }
5244         break;
5245     case 1:
5246         nzone  = dd_z1n;
5247         nzonep = dd_zp1n;
5248         for(i=0; i<nzonep; i++)
5249         {
5250             copy_ivec(dd_zp1[i],dd_zp[i]);
5251         }
5252         break;
5253     default:
5254         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5255         nzone = 0;
5256         nzonep = 0;
5257     }
5258
5259     zones = &dd->comm->zones;
5260
5261     for(i=0; i<nzone; i++)
5262     {
5263         m = 0;
5264         clear_ivec(zones->shift[i]);
5265         for(d=0; d<dd->ndim; d++)
5266         {
5267             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5268         }
5269     }
5270
5271     zones->n = nzone;
5272     for(i=0; i<nzone; i++)
5273     {
5274         for(d=0; d<DIM; d++)
5275         {
5276             s[d] = dd->ci[d] - zones->shift[i][d];
5277             if (s[d] < 0)
5278             {
5279                 s[d] += dd->nc[d];
5280             }
5281             else if (s[d] >= dd->nc[d])
5282             {
5283                 s[d] -= dd->nc[d];
5284             }
5285         }
5286     }
5287     zones->nizone = nzonep;
5288     for(i=0; i<zones->nizone; i++)
5289     {
5290         if (dd_zp[i][0] != i)
5291         {
5292             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5293         }
5294         izone = &zones->izone[i];
5295         izone->j0 = dd_zp[i][1];
5296         izone->j1 = dd_zp[i][2];
5297         for(dim=0; dim<DIM; dim++)
5298         {
5299             if (dd->nc[dim] == 1)
5300             {
5301                 /* All shifts should be allowed */
5302                 izone->shift0[dim] = -1;
5303                 izone->shift1[dim] = 1;
5304             }
5305             else
5306             {
5307                 /*
5308                   izone->shift0[d] = 0;
5309                   izone->shift1[d] = 0;
5310                   for(j=izone->j0; j<izone->j1; j++) {
5311                   if (dd->shift[j][d] > dd->shift[i][d])
5312                   izone->shift0[d] = -1;
5313                   if (dd->shift[j][d] < dd->shift[i][d])
5314                   izone->shift1[d] = 1;
5315                   }
5316                 */
5317
5318                 int shift_diff;
5319
5320                 /* Assume the shift are not more than 1 cell */
5321                 izone->shift0[dim] = 1;
5322                 izone->shift1[dim] = -1;
5323                 for(j=izone->j0; j<izone->j1; j++)
5324                 {
5325                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5326                     if (shift_diff < izone->shift0[dim])
5327                     {
5328                         izone->shift0[dim] = shift_diff;
5329                     }
5330                     if (shift_diff > izone->shift1[dim])
5331                     {
5332                         izone->shift1[dim] = shift_diff;
5333                     }
5334                 }
5335             }
5336         }
5337     }
5338
5339     if (dd->comm->eDLB != edlbNO)
5340     {
5341         snew(dd->comm->root,dd->ndim);
5342     }
5343
5344     if (dd->comm->bRecordLoad)
5345     {
5346         make_load_communicators(dd);
5347     }
5348 }
5349
5350 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5351 {
5352     gmx_domdec_t *dd;
5353     gmx_domdec_comm_t *comm;
5354     int  i,rank,*buf;
5355     ivec periods;
5356 #ifdef GMX_MPI
5357     MPI_Comm comm_cart;
5358 #endif
5359
5360     dd = cr->dd;
5361     comm = dd->comm;
5362
5363 #ifdef GMX_MPI
5364     if (comm->bCartesianPP)
5365     {
5366         /* Set up cartesian communication for the particle-particle part */
5367         if (fplog)
5368         {
5369             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5370                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5371         }
5372
5373         for(i=0; i<DIM; i++)
5374         {
5375             periods[i] = TRUE;
5376         }
5377         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5378                         &comm_cart);
5379         /* We overwrite the old communicator with the new cartesian one */
5380         cr->mpi_comm_mygroup = comm_cart;
5381     }
5382
5383     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5384     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5385
5386     if (comm->bCartesianPP_PME)
5387     {
5388         /* Since we want to use the original cartesian setup for sim,
5389          * and not the one after split, we need to make an index.
5390          */
5391         snew(comm->ddindex2ddnodeid,dd->nnodes);
5392         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5393         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5394         /* Get the rank of the DD master,
5395          * above we made sure that the master node is a PP node.
5396          */
5397         if (MASTER(cr))
5398         {
5399             rank = dd->rank;
5400         }
5401         else
5402         {
5403             rank = 0;
5404         }
5405         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5406     }
5407     else if (comm->bCartesianPP)
5408     {
5409         if (cr->npmenodes == 0)
5410         {
5411             /* The PP communicator is also
5412              * the communicator for this simulation
5413              */
5414             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5415         }
5416         cr->nodeid = dd->rank;
5417
5418         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5419
5420         /* We need to make an index to go from the coordinates
5421          * to the nodeid of this simulation.
5422          */
5423         snew(comm->ddindex2simnodeid,dd->nnodes);
5424         snew(buf,dd->nnodes);
5425         if (cr->duty & DUTY_PP)
5426         {
5427             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5428         }
5429         /* Communicate the ddindex to simulation nodeid index */
5430         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5431                       cr->mpi_comm_mysim);
5432         sfree(buf);
5433
5434         /* Determine the master coordinates and rank.
5435          * The DD master should be the same node as the master of this sim.
5436          */
5437         for(i=0; i<dd->nnodes; i++)
5438         {
5439             if (comm->ddindex2simnodeid[i] == 0)
5440             {
5441                 ddindex2xyz(dd->nc,i,dd->master_ci);
5442                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5443             }
5444         }
5445         if (debug)
5446         {
5447             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5448         }
5449     }
5450     else
5451     {
5452         /* No Cartesian communicators */
5453         /* We use the rank in dd->comm->all as DD index */
5454         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5455         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5456         dd->masterrank = 0;
5457         clear_ivec(dd->master_ci);
5458     }
5459 #endif
5460
5461     if (fplog)
5462     {
5463         fprintf(fplog,
5464                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5465                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5466     }
5467     if (debug)
5468     {
5469         fprintf(debug,
5470                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5471                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5472     }
5473 }
5474
5475 static void receive_ddindex2simnodeid(t_commrec *cr)
5476 {
5477     gmx_domdec_t *dd;
5478
5479     gmx_domdec_comm_t *comm;
5480     int  *buf;
5481
5482     dd = cr->dd;
5483     comm = dd->comm;
5484
5485 #ifdef GMX_MPI
5486     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5487     {
5488         snew(comm->ddindex2simnodeid,dd->nnodes);
5489         snew(buf,dd->nnodes);
5490         if (cr->duty & DUTY_PP)
5491         {
5492             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5493         }
5494 #ifdef GMX_MPI
5495         /* Communicate the ddindex to simulation nodeid index */
5496         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5497                       cr->mpi_comm_mysim);
5498 #endif
5499         sfree(buf);
5500     }
5501 #endif
5502 }
5503
5504 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5505                                                      int ncg,int natoms)
5506 {
5507     gmx_domdec_master_t *ma;
5508     int i;
5509
5510     snew(ma,1);
5511
5512     snew(ma->ncg,dd->nnodes);
5513     snew(ma->index,dd->nnodes+1);
5514     snew(ma->cg,ncg);
5515     snew(ma->nat,dd->nnodes);
5516     snew(ma->ibuf,dd->nnodes*2);
5517     snew(ma->cell_x,DIM);
5518     for(i=0; i<DIM; i++)
5519     {
5520         snew(ma->cell_x[i],dd->nc[i]+1);
5521     }
5522
5523     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5524     {
5525         ma->vbuf = NULL;
5526     }
5527     else
5528     {
5529         snew(ma->vbuf,natoms);
5530     }
5531
5532     return ma;
5533 }
5534
5535 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5536                                int reorder)
5537 {
5538     gmx_domdec_t *dd;
5539     gmx_domdec_comm_t *comm;
5540     int  i,rank;
5541     bool bDiv[DIM];
5542     ivec periods;
5543 #ifdef GMX_MPI
5544     MPI_Comm comm_cart;
5545 #endif
5546
5547     dd = cr->dd;
5548     comm = dd->comm;
5549
5550     if (comm->bCartesianPP)
5551     {
5552         for(i=1; i<DIM; i++)
5553         {
5554             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5555         }
5556         if (bDiv[YY] || bDiv[ZZ])
5557         {
5558             comm->bCartesianPP_PME = TRUE;
5559             /* We choose the direction that provides the thinnest slab
5560              * of PME only nodes as this will have the least effect
5561              * on the PP communication.
5562              * But for the PME communication the opposite might be better.
5563              */
5564             if (bDiv[YY] && (!bDiv[ZZ] || dd->nc[YY] <= dd->nc[ZZ]))
5565             {
5566                 comm->cartpmedim = YY;
5567             }
5568             else
5569             {
5570                 comm->cartpmedim = ZZ;
5571             }
5572             comm->ntot[comm->cartpmedim]
5573                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5574         }
5575         else if (fplog)
5576         {
5577             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5578             fprintf(fplog,
5579                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5580         }
5581     }
5582
5583 #ifdef GMX_MPI
5584     if (comm->bCartesianPP_PME)
5585     {
5586         if (fplog)
5587         {
5588             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5589         }
5590
5591         for(i=0; i<DIM; i++)
5592         {
5593             periods[i] = TRUE;
5594         }
5595         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5596                         &comm_cart);
5597
5598         MPI_Comm_rank(comm_cart,&rank);
5599         if (MASTERNODE(cr) && rank != 0)
5600         {
5601             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5602         }
5603
5604         /* With this assigment we loose the link to the original communicator
5605          * which will usually be MPI_COMM_WORLD, unless have multisim.
5606          */
5607         cr->mpi_comm_mysim = comm_cart;
5608         cr->sim_nodeid = rank;
5609
5610         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5611
5612         if (fplog)
5613         {
5614             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5615                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5616         }
5617
5618         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5619         {
5620             cr->duty = DUTY_PP;
5621         }
5622         if (cr->npmenodes == 0 ||
5623             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5624         {
5625             cr->duty = DUTY_PME;
5626         }
5627
5628         /* Split the sim communicator into PP and PME only nodes */
5629         MPI_Comm_split(cr->mpi_comm_mysim,
5630                        cr->duty,
5631                        dd_index(comm->ntot,dd->ci),
5632                        &cr->mpi_comm_mygroup);
5633     }
5634     else
5635     {
5636         switch (dd_node_order)
5637         {
5638         case ddnoPP_PME:
5639             if (fplog)
5640             {
5641                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5642             }
5643             break;
5644         case ddnoINTERLEAVE:
5645             /* Interleave the PP-only and PME-only nodes,
5646              * as on clusters with dual-core machines this will double
5647              * the communication bandwidth of the PME processes
5648              * and thus speed up the PP <-> PME and inter PME communication.
5649              */
5650             if (fplog)
5651             {
5652                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5653             }
5654             comm->pmenodes = dd_pmenodes(cr);
5655             break;
5656         case ddnoCARTESIAN:
5657             break;
5658         default:
5659             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5660         }
5661
5662         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5663         {
5664             cr->duty = DUTY_PME;
5665         }
5666         else
5667         {
5668             cr->duty = DUTY_PP;
5669         }
5670
5671         /* Split the sim communicator into PP and PME only nodes */
5672         MPI_Comm_split(cr->mpi_comm_mysim,
5673                        cr->duty,
5674                        cr->nodeid,
5675                        &cr->mpi_comm_mygroup);
5676         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5677     }
5678 #endif
5679
5680     if (fplog)
5681     {
5682         fprintf(fplog,"This is a %s only node\n\n",
5683                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5684     }
5685 }
5686
5687 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5688 {
5689     gmx_domdec_t *dd;
5690     gmx_domdec_comm_t *comm;
5691     int CartReorder;
5692
5693     dd = cr->dd;
5694     comm = dd->comm;
5695
5696     copy_ivec(dd->nc,comm->ntot);
5697
5698     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5699     comm->bCartesianPP_PME = FALSE;
5700
5701     /* Reorder the nodes by default. This might change the MPI ranks.
5702      * Real reordering is only supported on very few architectures,
5703      * Blue Gene is one of them.
5704      */
5705     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5706
5707     if (cr->npmenodes > 0)
5708     {
5709         /* Split the communicator into a PP and PME part */
5710         split_communicator(fplog,cr,dd_node_order,CartReorder);
5711         if (comm->bCartesianPP_PME)
5712         {
5713             /* We (possibly) reordered the nodes in split_communicator,
5714              * so it is no longer required in make_pp_communicator.
5715              */
5716             CartReorder = FALSE;
5717         }
5718     }
5719     else
5720     {
5721         /* All nodes do PP and PME */
5722 #ifdef GMX_MPI
5723         /* We do not require separate communicators */
5724         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5725 #endif
5726     }
5727
5728     if (cr->duty & DUTY_PP)
5729     {
5730         /* Copy or make a new PP communicator */
5731         make_pp_communicator(fplog,cr,CartReorder);
5732     }
5733     else
5734     {
5735         receive_ddindex2simnodeid(cr);
5736     }
5737
5738     if (!(cr->duty & DUTY_PME))
5739     {
5740         /* Set up the commnuication to our PME node */
5741         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5742         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5743         if (debug)
5744         {
5745             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5746                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5747         }
5748     }
5749     else
5750     {
5751         dd->pme_nodeid = -1;
5752     }
5753
5754     if (DDMASTER(dd))
5755     {
5756         dd->ma = init_gmx_domdec_master_t(dd,
5757                                           comm->cgs_gl.nr,
5758                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5759     }
5760 }
5761
5762 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5763 {
5764     real *slb_frac,tot;
5765     int  i,n;
5766     double dbl;
5767
5768     slb_frac = NULL;
5769     if (nc > 1 && size_string != NULL)
5770     {
5771         if (fplog)
5772         {
5773             fprintf(fplog,"Using static load balancing for the %s direction\n",
5774                     dir);
5775         }
5776         snew(slb_frac,nc);
5777         tot = 0;
5778         for (i=0; i<nc; i++)
5779         {
5780             dbl = 0;
5781             sscanf(size_string,"%lf%n",&dbl,&n);
5782             if (dbl == 0)
5783             {
5784                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5785             }
5786             slb_frac[i] = dbl;
5787             size_string += n;
5788             tot += slb_frac[i];
5789         }
5790         /* Normalize */
5791         if (fplog)
5792         {
5793             fprintf(fplog,"Relative cell sizes:");
5794         }
5795         for (i=0; i<nc; i++)
5796         {
5797             slb_frac[i] /= tot;
5798             if (fplog)
5799             {
5800                 fprintf(fplog," %5.3f",slb_frac[i]);
5801             }
5802         }
5803         if (fplog)
5804         {
5805             fprintf(fplog,"\n");
5806         }
5807     }
5808
5809     return slb_frac;
5810 }
5811
5812 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5813 {
5814     int n,nmol,ftype;
5815     gmx_mtop_ilistloop_t iloop;
5816     t_ilist *il;
5817
5818     n = 0;
5819     iloop = gmx_mtop_ilistloop_init(mtop);
5820     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5821     {
5822         for(ftype=0; ftype<F_NRE; ftype++)
5823         {
5824             if ((interaction_function[ftype].flags & IF_BOND) &&
5825                 NRAL(ftype) >  2)
5826             {
5827                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5828             }
5829         }
5830   }
5831
5832   return n;
5833 }
5834
5835 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5836 {
5837     char *val;
5838     int  nst;
5839
5840     nst = def;
5841     val = getenv(env_var);
5842     if (val)
5843     {
5844         if (sscanf(val,"%d",&nst) <= 0)
5845         {
5846             nst = 1;
5847         }
5848         if (fplog)
5849         {
5850             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5851                     env_var,val,nst);
5852         }
5853     }
5854
5855     return nst;
5856 }
5857
5858 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5859 {
5860     if (MASTER(cr))
5861     {
5862         fprintf(stderr,"\n%s\n",warn_string);
5863     }
5864     if (fplog)
5865     {
5866         fprintf(fplog,"\n%s\n",warn_string);
5867     }
5868 }
5869
5870 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5871                                   t_inputrec *ir,FILE *fplog)
5872 {
5873     if (ir->ePBC == epbcSCREW &&
5874         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5875     {
5876         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
5877     }
5878
5879     if (ir->ns_type == ensSIMPLE)
5880     {
5881         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
5882     }
5883
5884     if (ir->nstlist == 0)
5885     {
5886         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
5887     }
5888
5889     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
5890     {
5891         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
5892     }
5893 }
5894
5895 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
5896 {
5897     int  di,d;
5898     real r;
5899
5900     r = ddbox->box_size[XX];
5901     for(di=0; di<dd->ndim; di++)
5902     {
5903         d = dd->dim[di];
5904         /* Check using the initial average cell size */
5905         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
5906     }
5907
5908     return r;
5909 }
5910
5911 static int check_dlb_support(FILE *fplog,t_commrec *cr,
5912                              const char *dlb_opt,bool bRecordLoad,
5913                              unsigned long Flags,t_inputrec *ir)
5914 {
5915     gmx_domdec_t *dd;
5916     int  eDLB=-1;
5917     char buf[STRLEN];
5918
5919     switch (dlb_opt[0])
5920     {
5921     case 'a': eDLB = edlbAUTO; break;
5922     case 'n': eDLB = edlbNO;   break;
5923     case 'y': eDLB = edlbYES;  break;
5924     default: gmx_incons("Unknown dlb_opt");
5925     }
5926
5927     if (Flags & MD_RERUN)
5928     {
5929         return edlbNO;
5930     }
5931
5932     if (!EI_DYNAMICS(ir->eI))
5933     {
5934         if (eDLB == edlbYES)
5935         {
5936             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
5937             dd_warning(cr,fplog,buf);
5938         }
5939
5940         return edlbNO;
5941     }
5942
5943     if (!bRecordLoad)
5944     {
5945         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
5946
5947         return edlbNO;
5948     }
5949
5950     if (Flags & MD_REPRODUCIBLE)
5951     {
5952         switch (eDLB)
5953         {
5954                         case edlbNO:
5955                                 break;
5956                         case edlbAUTO:
5957                                 dd_warning(cr,fplog,"NOTE: reproducability requested, will not use dynamic load balancing\n");
5958                                 eDLB = edlbNO;
5959                                 break;
5960                         case edlbYES:
5961                                 dd_warning(cr,fplog,"WARNING: reproducability requested with dynamic load balancing, the simulation will NOT be binary reproducable\n");
5962                                 break;
5963                         default:
5964                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
5965                                 break;
5966         }
5967     }
5968
5969     return eDLB;
5970 }
5971
5972 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
5973 {
5974     int dim;
5975
5976     dd->ndim = 0;
5977     if (getenv("GMX_DD_ORDER_ZYX"))
5978     {
5979         /* Decomposition order z,y,x */
5980         if (fplog)
5981         {
5982             fprintf(fplog,"Using domain decomposition order z, y, x\n");
5983         }
5984         for(dim=DIM-1; dim>=0; dim--)
5985         {
5986             if (dd->nc[dim] > 1)
5987             {
5988                 dd->dim[dd->ndim++] = dim;
5989             }
5990         }
5991     }
5992     else
5993     {
5994         /* Decomposition order x,y,z */
5995         for(dim=0; dim<DIM; dim++)
5996         {
5997             if (dd->nc[dim] > 1)
5998             {
5999                 dd->dim[dd->ndim++] = dim;
6000             }
6001         }
6002     }
6003 }
6004
6005 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6006                                         unsigned long Flags,
6007                                         ivec nc,
6008                                         real comm_distance_min,real rconstr,
6009                                         const char *dlb_opt,real dlb_scale,
6010                                         const char *sizex,const char *sizey,const char *sizez,
6011                                         gmx_mtop_t *mtop,t_inputrec *ir,
6012                                         matrix box,rvec *x,
6013                                         gmx_ddbox_t *ddbox,
6014                                         int *npme_major)
6015 {
6016     gmx_domdec_t *dd;
6017     gmx_domdec_comm_t *comm;
6018     int  recload;
6019     int  d,i,j;
6020     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6021     bool bC;
6022     char buf[STRLEN];
6023
6024     if (fplog)
6025     {
6026         fprintf(fplog,
6027                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6028     }
6029
6030     snew(dd,1);
6031     snew(dd->comm,1);
6032     comm = dd->comm;
6033     snew(comm->cggl_flag,DIM*2);
6034     snew(comm->cgcm_state,DIM*2);
6035
6036     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6037     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6038
6039     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6040     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6041     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6042     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6043     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6044     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6045     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6046
6047     if (dd->bSendRecv2 && fplog)
6048     {
6049         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6050     }
6051     if (comm->eFlop)
6052     {
6053         if (fplog)
6054         {
6055             fprintf(fplog,"Will load balance based on FLOP count\n");
6056         }
6057         if (comm->eFlop > 1)
6058         {
6059             srand(1+cr->nodeid);
6060         }
6061         comm->bRecordLoad = TRUE;
6062     }
6063     else
6064     {
6065         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6066
6067     }
6068
6069     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6070
6071     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6072     if (fplog)
6073     {
6074         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6075     }
6076     dd->bGridJump = comm->bDynLoadBal;
6077
6078     if (comm->nstSortCG)
6079     {
6080         if (fplog)
6081         {
6082             if (comm->nstSortCG == 1)
6083             {
6084                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6085             }
6086             else
6087             {
6088                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6089                         comm->nstSortCG);
6090             }
6091         }
6092         snew(comm->sort,1);
6093     }
6094     else
6095     {
6096         if (fplog)
6097         {
6098             fprintf(fplog,"Will not sort the charge groups\n");
6099         }
6100     }
6101
6102     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6103     if (comm->bInterCGBondeds)
6104     {
6105         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6106     }
6107     else
6108     {
6109         comm->bInterCGMultiBody = FALSE;
6110     }
6111
6112     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6113
6114     if (ir->rlistlong == 0)
6115     {
6116         /* Set the cut-off to some very large value,
6117          * so we don't need if statements everywhere in the code.
6118          * We use sqrt, since the cut-off is squared in some places.
6119          */
6120         comm->cutoff   = GMX_CUTOFF_INF;
6121     }
6122     else
6123     {
6124         comm->cutoff   = ir->rlistlong;
6125     }
6126     comm->cutoff_mbody = 0;
6127
6128     comm->cellsize_limit = 0;
6129     comm->bBondComm = FALSE;
6130
6131     if (comm->bInterCGBondeds)
6132     {
6133         if (comm_distance_min > 0)
6134         {
6135             comm->cutoff_mbody = comm_distance_min;
6136             if (Flags & MD_DDBONDCOMM)
6137             {
6138                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6139             }
6140             else
6141             {
6142                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6143             }
6144             r_bonded_limit = comm->cutoff_mbody;
6145         }
6146         else if (ir->bPeriodicMols)
6147         {
6148             /* Can not easily determine the required cut-off */
6149             dd_warning(cr,fplog,"NOTE: Periodic molecules: can not easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
6150             comm->cutoff_mbody = comm->cutoff/2;
6151             r_bonded_limit = comm->cutoff_mbody;
6152         }
6153         else
6154         {
6155             if (MASTER(cr))
6156             {
6157                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6158                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6159             }
6160             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6161             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6162
6163             /* We use an initial margin of 10% for the minimum cell size,
6164              * except when we are just below the non-bonded cut-off.
6165              */
6166             if (Flags & MD_DDBONDCOMM)
6167             {
6168                 if (max(r_2b,r_mb) > comm->cutoff)
6169                 {
6170                     r_bonded       = max(r_2b,r_mb);
6171                     r_bonded_limit = 1.1*r_bonded;
6172                     comm->bBondComm = TRUE;
6173                 }
6174                 else
6175                 {
6176                     r_bonded       = r_mb;
6177                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6178                 }
6179                 /* We determine cutoff_mbody later */
6180             }
6181             else
6182             {
6183                 /* No special bonded communication,
6184                  * simply increase the DD cut-off.
6185                  */
6186                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6187                 comm->cutoff_mbody = r_bonded_limit;
6188                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6189             }
6190         }
6191         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6192         if (fplog)
6193         {
6194             fprintf(fplog,
6195                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6196                     comm->cellsize_limit);
6197         }
6198     }
6199
6200     if (dd->bInterCGcons && rconstr <= 0)
6201     {
6202         /* There is a cell size limit due to the constraints (P-LINCS) */
6203         rconstr = constr_r_max(fplog,mtop,ir);
6204         if (fplog)
6205         {
6206             fprintf(fplog,
6207                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6208                     rconstr);
6209             if (rconstr > comm->cellsize_limit)
6210             {
6211                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6212             }
6213         }
6214     }
6215     else if (rconstr > 0 && fplog)
6216     {
6217         /* Here we do not check for dd->bInterCGcons,
6218          * because one can also set a cell size limit for virtual sites only
6219          * and at this point we don't know yet if there are intercg v-sites.
6220          */
6221         fprintf(fplog,
6222                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6223                 rconstr);
6224     }
6225     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6226
6227     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6228
6229     if (nc[XX] > 0)
6230     {
6231         copy_ivec(nc,dd->nc);
6232         set_dd_dim(fplog,dd);
6233         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6234
6235         if (cr->npmenodes == -1)
6236         {
6237             cr->npmenodes = 0;
6238         }
6239         acs = average_cellsize_min(dd,ddbox);
6240         if (acs < comm->cellsize_limit)
6241         {
6242             if (fplog)
6243             {
6244                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6245             }
6246             if (MASTER(cr))
6247             {
6248                 gmx_fatal(FARGS,"The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",acs,comm->cellsize_limit);
6249             }
6250 #ifdef GMX_MPI
6251             MPI_Abort(MPI_COMM_WORLD, 0);
6252 #else
6253             exit(0);
6254 #endif
6255         }
6256     }
6257     else
6258     {
6259         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6260
6261         /* We need to choose the optimal DD grid and possibly PME nodes */
6262         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6263                                comm->eDLB!=edlbNO,dlb_scale,
6264                                comm->cellsize_limit,comm->cutoff,
6265                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6266
6267         if (dd->nc[XX] == 0)
6268         {
6269             if (MASTER(cr))
6270             {
6271                 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6272                 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6273                         !bC ? "-rdd" : "-rcon",
6274                         comm->eDLB!=edlbNO ? " or -dds" : "",
6275                         bC ? " or your LINCS settings" : "");
6276                 gmx_fatal(FARGS,"There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6277                           "%s\n"
6278                           "Look in the log file for details on the domain decomposition",
6279                           cr->nnodes-cr->npmenodes,limit,buf);
6280             }
6281 #ifdef GMX_MPI
6282             MPI_Abort(MPI_COMM_WORLD, 0);
6283 #else
6284             exit(0);
6285 #endif
6286         }
6287         set_dd_dim(fplog,dd);
6288     }
6289
6290     if (fplog)
6291     {
6292         fprintf(fplog,
6293                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6294                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6295     }
6296
6297     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6298     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6299     {
6300         gmx_fatal(FARGS,"The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6301                   dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6302     }
6303     if (cr->npmenodes > dd->nnodes)
6304     {
6305         gmx_fatal(FARGS,"The number of separate PME node (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6306     }
6307     if (cr->npmenodes > 0)
6308     {
6309         comm->npmenodes = cr->npmenodes;
6310     }
6311     else
6312     {
6313         comm->npmenodes = dd->nnodes;
6314     }
6315
6316     if (EEL_PME(ir->coulombtype))
6317     {
6318         if (FALSE && dd->nc[XX] > cr->npmenodes && cr->npmenodes % dd->nc[XX] == 0)
6319         {
6320             comm->npmedecompdim   = 2;
6321             comm->npmenodes_major = dd->nc[XX];
6322         }
6323         else
6324         {
6325             comm->npmedecompdim   = 1;
6326             comm->npmenodes_major = comm->npmenodes;
6327         }
6328     }
6329     else
6330     {
6331         comm->npmedecompdim   = 0;
6332         comm->npmenodes_major = 0;
6333     }
6334     *npme_major = comm->npmenodes_major;
6335
6336     snew(comm->slb_frac,DIM);
6337     if (comm->eDLB == edlbNO)
6338     {
6339         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6340         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6341         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6342     }
6343
6344     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6345     {
6346         if (comm->bBondComm || comm->eDLB != edlbNO)
6347         {
6348             /* Set the bonded communication distance to halfway
6349              * the minimum and the maximum,
6350              * since the extra communication cost is nearly zero.
6351              */
6352             acs = average_cellsize_min(dd,ddbox);
6353             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6354             if (comm->eDLB != edlbNO)
6355             {
6356                 /* Check if this does not limit the scaling */
6357                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6358             }
6359             if (!comm->bBondComm)
6360             {
6361                 /* Without bBondComm do not go beyond the n.b. cut-off */
6362                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6363                 if (comm->cellsize_limit >= comm->cutoff)
6364                 {
6365                     /* We don't loose a lot of efficieny
6366                      * when increasing it to the n.b. cut-off.
6367                      * It can even be slightly faster, because we need
6368                      * less checks for the communication setup.
6369                      */
6370                     comm->cutoff_mbody = comm->cutoff;
6371                 }
6372             }
6373             /* Check if we did not end up below our original limit */
6374             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6375
6376             if (comm->cutoff_mbody > comm->cellsize_limit)
6377             {
6378                 comm->cellsize_limit = comm->cutoff_mbody;
6379             }
6380         }
6381         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6382     }
6383
6384     if (debug)
6385     {
6386         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6387                 "cellsize limit %f\n",
6388                 comm->bBondComm,comm->cellsize_limit);
6389     }
6390
6391     if (MASTER(cr))
6392     {
6393         check_dd_restrictions(cr,dd,ir,fplog);
6394     }
6395
6396     comm->partition_step = INT_MIN;
6397     dd->ddp_count = 0;
6398
6399     return dd;
6400 }
6401
6402 static void set_dlb_limits(gmx_domdec_t *dd)
6403
6404 {
6405     int d;
6406
6407     for(d=0; d<dd->ndim; d++)
6408     {
6409         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6410         dd->comm->cellsize_min[dd->dim[d]] =
6411             dd->comm->cellsize_min_dlb[dd->dim[d]];
6412     }
6413 }
6414
6415
6416 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_step_t step)
6417 {
6418     gmx_domdec_t *dd;
6419     gmx_domdec_comm_t *comm;
6420     real cellsize_min;
6421     int  d,nc,i;
6422     char buf[STRLEN];
6423
6424     dd = cr->dd;
6425     comm = dd->comm;
6426
6427     if (fplog)
6428     {
6429         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6430     }
6431
6432     cellsize_min = comm->cellsize_min[dd->dim[0]];
6433     for(d=1; d<dd->ndim; d++)
6434     {
6435         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6436     }
6437
6438     if (cellsize_min < comm->cellsize_limit*1.05)
6439     {
6440         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6441
6442         return;
6443     }
6444
6445     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6446     comm->bDynLoadBal = TRUE;
6447     dd->bGridJump = TRUE;
6448
6449     set_dlb_limits(dd);
6450
6451     /* We can set the required cell size info here,
6452      * so we do not need to communicate this.
6453      * The grid is completely uniform.
6454      */
6455     for(d=0; d<dd->ndim; d++)
6456     {
6457         if (comm->root[d])
6458         {
6459             comm->load[d].sum_m = comm->load[d].sum;
6460
6461             nc = dd->nc[dd->dim[d]];
6462             for(i=0; i<nc; i++)
6463             {
6464                 comm->root[d]->cell_f[i]    = i/(real)nc;
6465                 if (d > 0)
6466                 {
6467                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6468                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6469                 }
6470             }
6471             comm->root[d]->cell_f[nc] = 1.0;
6472         }
6473     }
6474 }
6475
6476 static char *init_bLocalCG(gmx_mtop_t *mtop)
6477 {
6478     int  ncg,cg;
6479     char *bLocalCG;
6480
6481     ncg = ncg_mtop(mtop);
6482     snew(bLocalCG,ncg);
6483     for(cg=0; cg<ncg; cg++)
6484     {
6485         bLocalCG[cg] = FALSE;
6486     }
6487
6488     return bLocalCG;
6489 }
6490
6491 void dd_init_bondeds(FILE *fplog,
6492                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6493                      gmx_vsite_t *vsite,gmx_constr_t constr,
6494                      t_inputrec *ir,bool bBCheck,cginfo_mb_t *cginfo_mb)
6495 {
6496     gmx_domdec_comm_t *comm;
6497     bool bBondComm;
6498     int  d;
6499
6500     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6501
6502     comm = dd->comm;
6503
6504     if (comm->bBondComm)
6505     {
6506         /* Communicate atoms beyond the cut-off for bonded interactions */
6507         comm = dd->comm;
6508
6509         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6510
6511         comm->bLocalCG = init_bLocalCG(mtop);
6512     }
6513     else
6514     {
6515         /* Only communicate atoms based on cut-off */
6516         comm->cglink   = NULL;
6517         comm->bLocalCG = NULL;
6518     }
6519 }
6520
6521 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6522                               t_inputrec *ir,
6523                               bool bDynLoadBal,real dlb_scale,
6524                               gmx_ddbox_t *ddbox)
6525 {
6526     gmx_domdec_comm_t *comm;
6527     int  d;
6528     ivec np;
6529     real limit,shrink;
6530     char buf[64];
6531
6532     if (fplog == NULL)
6533     {
6534         return;
6535     }
6536
6537     comm = dd->comm;
6538
6539     if (bDynLoadBal)
6540     {
6541         fprintf(fplog,"The maximum number of communication pulses is:");
6542         for(d=0; d<dd->ndim; d++)
6543         {
6544             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6545         }
6546         fprintf(fplog,"\n");
6547         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6548         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6549         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6550         for(d=0; d<DIM; d++)
6551         {
6552             if (dd->nc[d] > 1)
6553             {
6554                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6555                 {
6556                     shrink = 0;
6557                 }
6558                 else
6559                 {
6560                     shrink =
6561                         comm->cellsize_min_dlb[d]/
6562                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6563                 }
6564                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6565             }
6566         }
6567         fprintf(fplog,"\n");
6568     }
6569     else
6570     {
6571         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6572         fprintf(fplog,"The initial number of communication pulses is:");
6573         for(d=0; d<dd->ndim; d++)
6574         {
6575             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6576         }
6577         fprintf(fplog,"\n");
6578         fprintf(fplog,"The initial domain decomposition cell size is:");
6579         for(d=0; d<DIM; d++) {
6580             if (dd->nc[d] > 1)
6581             {
6582                 fprintf(fplog," %c %.2f nm",
6583                         dim2char(d),dd->comm->cellsize_min[d]);
6584             }
6585         }
6586         fprintf(fplog,"\n\n");
6587     }
6588
6589     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6590     {
6591         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6592         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6593                 "non-bonded interactions","",comm->cutoff);
6594
6595         if (bDynLoadBal)
6596         {
6597             limit = dd->comm->cellsize_limit;
6598         }
6599         else
6600         {
6601             if (dynamic_dd_box(ddbox,ir))
6602             {
6603                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6604             }
6605             limit = dd->comm->cellsize_min[XX];
6606             for(d=1; d<DIM; d++)
6607             {
6608                 limit = min(limit,dd->comm->cellsize_min[d]);
6609             }
6610         }
6611
6612         if (comm->bInterCGBondeds)
6613         {
6614             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6615                     "two-body bonded interactions","(-rdd)",
6616                     max(comm->cutoff,comm->cutoff_mbody));
6617             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6618                     "multi-body bonded interactions","(-rdd)",
6619                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6620         }
6621         if (dd->vsite_comm)
6622         {
6623             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6624                     "virtual site constructions","(-rcon)",limit);
6625         }
6626         if (dd->constraint_comm)
6627         {
6628             sprintf(buf,"atoms separated by up to %d constraints",
6629                     1+ir->nProjOrder);
6630             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6631                     buf,"(-rcon)",limit);
6632         }
6633         fprintf(fplog,"\n");
6634     }
6635
6636     fflush(fplog);
6637 }
6638
6639 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6640                        t_inputrec *ir,t_forcerec *fr,
6641                        gmx_ddbox_t *ddbox)
6642 {
6643     gmx_domdec_comm_t *comm;
6644     int  d,dim,npulse,npulse_d_max,npulse_d;
6645     bool bNoCutOff;
6646     int  natoms_tot;
6647     real vol_frac;
6648
6649     comm = dd->comm;
6650
6651     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6652
6653     if (EEL_PME(ir->coulombtype))
6654     {
6655         init_ddpme(dd,&comm->ddpme[0],0,comm->npmenodes_major);
6656         if (comm->npmedecompdim >= 2)
6657         {
6658             init_ddpme(dd,&comm->ddpme[1],1,
6659                        comm->npmenodes/comm->npmenodes_major);
6660         }
6661     }
6662     else
6663     {
6664         comm->npmenodes = 0;
6665         if (dd->pme_nodeid >= 0)
6666         {
6667             gmx_fatal(FARGS,"Can not have separate PME nodes without PME electrostatics");
6668         }
6669     }
6670
6671     /* If each molecule is a single charge group
6672      * or we use domain decomposition for each periodic dimension,
6673      * we do not need to take pbc into account for the bonded interactions.
6674      */
6675     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6676         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6677     {
6678         fr->bMolPBC = FALSE;
6679     }
6680     else
6681     {
6682         fr->bMolPBC = TRUE;
6683     }
6684
6685     if (debug)
6686     {
6687         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6688     }
6689     if (comm->eDLB != edlbNO)
6690     {
6691         /* Determine the maximum number of comm. pulses in one dimension */
6692
6693         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6694
6695         /* Determine the maximum required number of grid pulses */
6696         if (comm->cellsize_limit >= comm->cutoff)
6697         {
6698             /* Only a single pulse is required */
6699             npulse = 1;
6700         }
6701         else if (!bNoCutOff && comm->cellsize_limit > 0)
6702         {
6703             /* We round down slightly here to avoid overhead due to the latency
6704              * of extra communication calls when the cut-off
6705              * would be only slightly longer than the cell size.
6706              * Later cellsize_limit is redetermined,
6707              * so we can not miss interactions due to this rounding.
6708              */
6709             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6710         }
6711         else
6712         {
6713             /* There is no cell size limit */
6714             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6715         }
6716
6717         if (!bNoCutOff && npulse > 1)
6718         {
6719             /* See if we can do with less pulses, based on dlb_scale */
6720             npulse_d_max = 0;
6721             for(d=0; d<dd->ndim; d++)
6722             {
6723                 dim = dd->dim[d];
6724                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6725                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6726                 npulse_d_max = max(npulse_d_max,npulse_d);
6727             }
6728             npulse = min(npulse,npulse_d_max);
6729         }
6730
6731         /* This env var can override npulse */
6732         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6733         if (d > 0)
6734         {
6735             npulse = d;
6736         }
6737
6738         comm->maxpulse = 1;
6739         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6740         for(d=0; d<dd->ndim; d++)
6741         {
6742             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6743             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6744             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6745             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6746             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6747             {
6748                 comm->bVacDLBNoLimit = FALSE;
6749             }
6750         }
6751
6752         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6753         if (!comm->bVacDLBNoLimit)
6754         {
6755             comm->cellsize_limit = max(comm->cellsize_limit,
6756                                        comm->cutoff/comm->maxpulse);
6757         }
6758         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6759         /* Set the minimum cell size for each DD dimension */
6760         for(d=0; d<dd->ndim; d++)
6761         {
6762             if (comm->bVacDLBNoLimit ||
6763                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6764             {
6765                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6766             }
6767             else
6768             {
6769                 comm->cellsize_min_dlb[dd->dim[d]] =
6770                     comm->cutoff/comm->cd[d].np_dlb;
6771             }
6772         }
6773         if (comm->cutoff_mbody <= 0)
6774         {
6775             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6776         }
6777         if (comm->bDynLoadBal)
6778         {
6779             set_dlb_limits(dd);
6780         }
6781     }
6782
6783     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6784     if (comm->eDLB == edlbAUTO)
6785     {
6786         if (fplog)
6787         {
6788             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6789         }
6790         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6791     }
6792
6793     vol_frac = 1/(real)dd->nnodes + comm_box_frac(dd->nc,comm->cutoff,ddbox);
6794     if (debug)
6795     {
6796         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
6797     }
6798     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
6799
6800     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
6801 }
6802
6803 static void merge_cg_buffers(int ncell,
6804                              gmx_domdec_comm_dim_t *cd, int pulse,
6805                              int  *ncg_cell,
6806                              int  *index_gl, int  *recv_i,
6807                              rvec *cg_cm,    rvec *recv_vr,
6808                              int *cgindex,
6809                              cginfo_mb_t *cginfo_mb,int *cginfo)
6810 {
6811     gmx_domdec_ind_t *ind,*ind_p;
6812     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
6813     int shift,shift_at;
6814
6815     ind = &cd->ind[pulse];
6816
6817     /* First correct the already stored data */
6818     shift = ind->nrecv[ncell];
6819     for(cell=ncell-1; cell>=0; cell--)
6820     {
6821         shift -= ind->nrecv[cell];
6822         if (shift > 0)
6823         {
6824             /* Move the cg's present from previous grid pulses */
6825             cg0 = ncg_cell[ncell+cell];
6826             cg1 = ncg_cell[ncell+cell+1];
6827             cgindex[cg1+shift] = cgindex[cg1];
6828             for(cg=cg1-1; cg>=cg0; cg--)
6829             {
6830                 index_gl[cg+shift] = index_gl[cg];
6831                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
6832                 cgindex[cg+shift] = cgindex[cg];
6833                 cginfo[cg+shift] = cginfo[cg];
6834             }
6835             /* Correct the already stored send indices for the shift */
6836             for(p=1; p<=pulse; p++)
6837             {
6838                 ind_p = &cd->ind[p];
6839                 cg0 = 0;
6840                 for(c=0; c<cell; c++)
6841                 {
6842                     cg0 += ind_p->nsend[c];
6843                 }
6844                 cg1 = cg0 + ind_p->nsend[cell];
6845                 for(cg=cg0; cg<cg1; cg++)
6846                 {
6847                     ind_p->index[cg] += shift;
6848                 }
6849             }
6850         }
6851     }
6852
6853     /* Merge in the communicated buffers */
6854     shift = 0;
6855     shift_at = 0;
6856     cg0 = 0;
6857     for(cell=0; cell<ncell; cell++)
6858     {
6859         cg1 = ncg_cell[ncell+cell+1] + shift;
6860         if (shift_at > 0)
6861         {
6862             /* Correct the old cg indices */
6863             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
6864             {
6865                 cgindex[cg+1] += shift_at;
6866             }
6867         }
6868         for(cg=0; cg<ind->nrecv[cell]; cg++)
6869         {
6870             /* Copy this charge group from the buffer */
6871             index_gl[cg1] = recv_i[cg0];
6872             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
6873             /* Add it to the cgindex */
6874             cg_gl = index_gl[cg1];
6875             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
6876             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
6877             cgindex[cg1+1] = cgindex[cg1] + nat;
6878             cg0++;
6879             cg1++;
6880             shift_at += nat;
6881         }
6882         shift += ind->nrecv[cell];
6883         ncg_cell[ncell+cell+1] = cg1;
6884     }
6885 }
6886
6887 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
6888                                int nzone,int cg0,const int *cgindex)
6889 {
6890     int cg,zone,p;
6891
6892     /* Store the atom block boundaries for easy copying of communication buffers
6893      */
6894     cg = cg0;
6895     for(zone=0; zone<nzone; zone++)
6896     {
6897         for(p=0; p<cd->np; p++) {
6898             cd->ind[p].cell2at0[zone] = cgindex[cg];
6899             cg += cd->ind[p].nrecv[zone];
6900             cd->ind[p].cell2at1[zone] = cgindex[cg];
6901         }
6902     }
6903 }
6904
6905 static bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
6906 {
6907     int  i;
6908     bool bMiss;
6909
6910     bMiss = FALSE;
6911     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
6912     {
6913         if (!bLocalCG[link->a[i]])
6914         {
6915             bMiss = TRUE;
6916         }
6917     }
6918
6919     return bMiss;
6920 }
6921
6922 static void setup_dd_communication(gmx_domdec_t *dd,
6923                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
6924 {
6925     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
6926     int nzone,nzone_send,zone,zonei,cg0,cg1;
6927     int c,i,j,cg,cg_gl,nrcg;
6928     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
6929     gmx_domdec_comm_t *comm;
6930     gmx_domdec_zones_t *zones;
6931     gmx_domdec_comm_dim_t *cd;
6932     gmx_domdec_ind_t *ind;
6933     cginfo_mb_t *cginfo_mb;
6934     bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
6935     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
6936     rvec rb,rn;
6937     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
6938     real bcorner[DIM],bcorner_round_1=0;
6939     ivec tric_dist;
6940     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
6941     real skew_fac2_d,skew_fac_01;
6942     rvec sf2_round;
6943     int  nsend,nat;
6944
6945     if (debug)
6946     {
6947         fprintf(debug,"Setting up DD communication\n");
6948     }
6949
6950     comm  = dd->comm;
6951     cg_cm = fr->cg_cm;
6952
6953     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
6954     {
6955         dim = dd->dim[dim_ind];
6956
6957         /* Check if we need to use triclinic distances */
6958         tric_dist[dim_ind] = 0;
6959         for(i=0; i<=dim_ind; i++)
6960         {
6961             if (ddbox->tric_dir[dd->dim[i]])
6962             {
6963                 tric_dist[dim_ind] = 1;
6964             }
6965         }
6966     }
6967
6968     bBondComm = comm->bBondComm;
6969
6970     /* Do we need to determine extra distances for multi-body bondeds? */
6971     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
6972
6973     /* Do we need to determine extra distances for only two-body bondeds? */
6974     bDist2B = (bBondComm && !bDistMB);
6975
6976     r_comm2  = sqr(comm->cutoff);
6977     r_bcomm2 = sqr(comm->cutoff_mbody);
6978
6979     if (debug)
6980     {
6981         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
6982     }
6983
6984     zones = &comm->zones;
6985
6986     dim0 = dd->dim[0];
6987     /* The first dimension is equal for all cells */
6988     corner[0][0] = comm->cell_x0[dim0];
6989     if (bDistMB)
6990     {
6991         bcorner[0] = corner[0][0];
6992     }
6993     if (dd->ndim >= 2)
6994     {
6995         dim1 = dd->dim[1];
6996         /* This cell row is only seen from the first row */
6997         corner[1][0] = comm->cell_x0[dim1];
6998         /* All rows can see this row */
6999         corner[1][1] = comm->cell_x0[dim1];
7000         if (dd->bGridJump)
7001         {
7002             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7003             if (bDistMB)
7004             {
7005                 /* For the multi-body distance we need the maximum */
7006                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7007             }
7008         }
7009         /* Set the upper-right corner for rounding */
7010         corner_round_0 = comm->cell_x1[dim0];
7011
7012         if (dd->ndim >= 3)
7013         {
7014             dim2 = dd->dim[2];
7015             for(j=0; j<4; j++)
7016             {
7017                 corner[2][j] = comm->cell_x0[dim2];
7018             }
7019             if (dd->bGridJump)
7020             {
7021                 /* Use the maximum of the i-cells that see a j-cell */
7022                 for(i=0; i<zones->nizone; i++)
7023                 {
7024                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7025                     {
7026                         if (j >= 4)
7027                         {
7028                             corner[2][j-4] =
7029                                 max(corner[2][j-4],
7030                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7031                         }
7032                     }
7033                 }
7034                 if (bDistMB)
7035                 {
7036                     /* For the multi-body distance we need the maximum */
7037                     bcorner[2] = comm->cell_x0[dim2];
7038                     for(i=0; i<2; i++)
7039                     {
7040                         for(j=0; j<2; j++)
7041                         {
7042                             bcorner[2] = max(bcorner[2],
7043                                              comm->zone_d2[i][j].p1_0);
7044                         }
7045                     }
7046                 }
7047             }
7048
7049             /* Set the upper-right corner for rounding */
7050             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7051              * Only cell (0,0,0) can see cell 7 (1,1,1)
7052              */
7053             corner_round_1[0] = comm->cell_x1[dim1];
7054             corner_round_1[3] = comm->cell_x1[dim1];
7055             if (dd->bGridJump)
7056             {
7057                 corner_round_1[0] = max(comm->cell_x1[dim1],
7058                                         comm->zone_d1[1].mch1);
7059                 if (bDistMB)
7060                 {
7061                     /* For the multi-body distance we need the maximum */
7062                     bcorner_round_1 = max(comm->cell_x1[dim1],
7063                                           comm->zone_d1[1].p1_1);
7064                 }
7065             }
7066         }
7067     }
7068
7069     /* Triclinic stuff */
7070     normal = ddbox->normal;
7071     skew_fac_01 = 0;
7072     if (dd->ndim >= 2)
7073     {
7074         v_0 = ddbox->v[dim0];
7075         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7076         {
7077             /* Determine the coupling coefficient for the distances
7078              * to the cell planes along dim0 and dim1 through dim2.
7079              * This is required for correct rounding.
7080              */
7081             skew_fac_01 =
7082                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7083             if (debug)
7084             {
7085                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7086             }
7087         }
7088     }
7089     if (dd->ndim >= 3)
7090     {
7091         v_1 = ddbox->v[dim1];
7092     }
7093
7094     zone_cg_range = zones->cg_range;
7095     index_gl = dd->index_gl;
7096     cgindex  = dd->cgindex;
7097     cginfo_mb = fr->cginfo_mb;
7098
7099     zone_cg_range[0]   = 0;
7100     zone_cg_range[1]   = dd->ncg_home;
7101     comm->zone_ncg1[0] = dd->ncg_home;
7102     pos_cg             = dd->ncg_home;
7103
7104     nat_tot = dd->nat_home;
7105     nzone = 1;
7106     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7107     {
7108         dim = dd->dim[dim_ind];
7109         cd = &comm->cd[dim_ind];
7110
7111         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7112         {
7113             /* No pbc in this dimension, the first node should not comm. */
7114             nzone_send = 0;
7115         }
7116         else
7117         {
7118             nzone_send = nzone;
7119         }
7120
7121         bScrew = (dd->bScrewPBC && dim == XX);
7122
7123         v_d = ddbox->v[dim];
7124         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7125
7126         cd->bInPlace = TRUE;
7127         for(p=0; p<cd->np; p++)
7128         {
7129             /* Only atoms communicated in the first pulse are used
7130              * for multi-body bonded interactions or for bBondComm.
7131              */
7132             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7133             bDistMB_pulse = (bDistMB && bDistBonded);
7134
7135             ind = &cd->ind[p];
7136             nsend = 0;
7137             nat = 0;
7138             for(zone=0; zone<nzone_send; zone++)
7139             {
7140                 if (tric_dist[dim_ind] && dim_ind > 0)
7141                 {
7142                     /* Determine slightly more optimized skew_fac's
7143                      * for rounding.
7144                      * This reduces the number of communicated atoms
7145                      * by about 10% for 3D DD of rhombic dodecahedra.
7146                      */
7147                     for(dimd=0; dimd<dim; dimd++)
7148                     {
7149                         sf2_round[dimd] = 1;
7150                         if (ddbox->tric_dir[dimd])
7151                         {
7152                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7153                             {
7154                                 /* If we are shifted in dimension i
7155                                  * and the cell plane is tilted forward
7156                                  * in dimension i, skip this coupling.
7157                                  */
7158                                 if (!(zones->shift[nzone+zone][i] &&
7159                                       ddbox->v[dimd][i][dimd] >= 0))
7160                                 {
7161                                     sf2_round[dimd] +=
7162                                         sqr(ddbox->v[dimd][i][dimd]);
7163                                 }
7164                             }
7165                             sf2_round[dimd] = 1/sf2_round[dimd];
7166                         }
7167                     }
7168                 }
7169
7170                 zonei = zone_perm[dim_ind][zone];
7171                 if (p == 0)
7172                 {
7173                     /* Here we permutate the zones to obtain a convenient order
7174                      * for neighbor searching
7175                      */
7176                     cg0 = zone_cg_range[zonei];
7177                     cg1 = zone_cg_range[zonei+1];
7178                 }
7179                 else
7180                 {
7181                     /* Look only at the cg's received in the previous grid pulse
7182                      */
7183                     cg1 = zone_cg_range[nzone+zone+1];
7184                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7185                 }
7186                 ind->nsend[zone] = 0;
7187                 for(cg=cg0; cg<cg1; cg++)
7188                 {
7189                     r2  = 0;
7190                     rb2 = 0;
7191                     if (tric_dist[dim_ind] == 0)
7192                     {
7193                         /* Rectangular direction, easy */
7194                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7195                         if (r > 0)
7196                         {
7197                             r2 += r*r;
7198                         }
7199                         if (bDistMB_pulse)
7200                         {
7201                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7202                             if (r > 0)
7203                             {
7204                                 rb2 += r*r;
7205                             }
7206                         }
7207                         /* Rounding gives at most a 16% reduction
7208                          * in communicated atoms
7209                          */
7210                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7211                         {
7212                             r = cg_cm[cg][dim0] - corner_round_0;
7213                             /* This is the first dimension, so always r >= 0 */
7214                             r2 += r*r;
7215                             if (bDistMB_pulse)
7216                             {
7217                                 rb2 += r*r;
7218                             }
7219                         }
7220                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7221                         {
7222                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7223                             if (r > 0)
7224                             {
7225                                 r2 += r*r;
7226                             }
7227                             if (bDistMB_pulse)
7228                             {
7229                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7230                                 if (r > 0)
7231                                 {
7232                                     rb2 += r*r;
7233                                 }
7234                             }
7235                         }
7236                     }
7237                     else
7238                     {
7239                         /* Triclinic direction, more complicated */
7240                         clear_rvec(rn);
7241                         clear_rvec(rb);
7242                         /* Rounding, conservative as the skew_fac multiplication
7243                          * will slightly underestimate the distance.
7244                          */
7245                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7246                         {
7247                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7248                             for(i=dim0+1; i<DIM; i++)
7249                             {
7250                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7251                             }
7252                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7253                             if (bDistMB_pulse)
7254                             {
7255                                 rb[dim0] = rn[dim0];
7256                                 rb2 = r2;
7257                             }
7258                             /* Take care that the cell planes along dim0 might not
7259                              * be orthogonal to those along dim1 and dim2.
7260                              */
7261                             for(i=1; i<=dim_ind; i++)
7262                             {
7263                                 dimd = dd->dim[i];
7264                                 if (normal[dim0][dimd] > 0)
7265                                 {
7266                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7267                                     if (bDistMB_pulse)
7268                                     {
7269                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7270                                     }
7271                                 }
7272                             }
7273                         }
7274                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7275                         {
7276                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7277                             tric_sh = 0;
7278                             for(i=dim1+1; i<DIM; i++)
7279                             {
7280                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7281                             }
7282                             rn[dim1] += tric_sh;
7283                             if (rn[dim1] > 0)
7284                             {
7285                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7286                                 /* Take care of coupling of the distances
7287                                  * to the planes along dim0 and dim1 through dim2.
7288                                  */
7289                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7290                                 /* Take care that the cell planes along dim1
7291                                  * might not be orthogonal to that along dim2.
7292                                  */
7293                                 if (normal[dim1][dim2] > 0)
7294                                 {
7295                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7296                                 }
7297                             }
7298                             if (bDistMB_pulse)
7299                             {
7300                                 rb[dim1] +=
7301                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7302                                 if (rb[dim1] > 0)
7303                                 {
7304                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7305                                     /* Take care of coupling of the distances
7306                                      * to the planes along dim0 and dim1 through dim2.
7307                                      */
7308                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7309                                     /* Take care that the cell planes along dim1
7310                                      * might not be orthogonal to that along dim2.
7311                                      */
7312                                     if (normal[dim1][dim2] > 0)
7313                                     {
7314                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7315                                     }
7316                                 }
7317                             }
7318                         }
7319                         /* The distance along the communication direction */
7320                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7321                         tric_sh = 0;
7322                         for(i=dim+1; i<DIM; i++)
7323                         {
7324                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7325                         }
7326                         rn[dim] += tric_sh;
7327                         if (rn[dim] > 0)
7328                         {
7329                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7330                             /* Take care of coupling of the distances
7331                              * to the planes along dim0 and dim1 through dim2.
7332                              */
7333                             if (dim_ind == 1 && zonei == 1)
7334                             {
7335                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7336                             }
7337                         }
7338                         if (bDistMB_pulse)
7339                         {
7340                             clear_rvec(rb);
7341                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7342                             if (rb[dim] > 0)
7343                             {
7344                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7345                                 /* Take care of coupling of the distances
7346                                  * to the planes along dim0 and dim1 through dim2.
7347                                  */
7348                                 if (dim_ind == 1 && zonei == 1)
7349                                 {
7350                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7351                                 }
7352                             }
7353                         }
7354                     }
7355
7356                     if (r2 < r_comm2 ||
7357                         (bDistBonded &&
7358                          ((bDistMB && rb2 < r_bcomm2) ||
7359                           (bDist2B && r2  < r_bcomm2)) &&
7360                          (!bBondComm ||
7361                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7362                            missing_link(comm->cglink,index_gl[cg],
7363                                         comm->bLocalCG)))))
7364                     {
7365                         /* Make an index to the local charge groups */
7366                         if (nsend+1 > ind->nalloc)
7367                         {
7368                             ind->nalloc = over_alloc_large(nsend+1);
7369                             srenew(ind->index,ind->nalloc);
7370                         }
7371                         if (nsend+1 > comm->nalloc_int)
7372                         {
7373                             comm->nalloc_int = over_alloc_large(nsend+1);
7374                             srenew(comm->buf_int,comm->nalloc_int);
7375                         }
7376                         ind->index[nsend] = cg;
7377                         comm->buf_int[nsend] = index_gl[cg];
7378                         ind->nsend[zone]++;
7379                         check_vec_rvec_alloc(&comm->vbuf,nsend+1);
7380
7381                         if (dd->ci[dim] == 0)
7382                         {
7383                             /* Correct cg_cm for pbc */
7384                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7385                             if (bScrew)
7386                             {
7387                                 comm->vbuf.v[nsend][YY] =
7388                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7389                                 comm->vbuf.v[nsend][ZZ] =
7390                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7391                             }
7392                         }
7393                         else
7394                         {
7395                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7396                         }
7397                         nsend++;
7398                         nat += cgindex[cg+1] - cgindex[cg];
7399                     }
7400                 }
7401             }
7402             /* Clear the counts in case we do not have pbc */
7403             for(zone=nzone_send; zone<nzone; zone++)
7404             {
7405                 ind->nsend[zone] = 0;
7406             }
7407             ind->nsend[nzone]   = nsend;
7408             ind->nsend[nzone+1] = nat;
7409             /* Communicate the number of cg's and atoms to receive */
7410             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7411                             ind->nsend, nzone+2,
7412                             ind->nrecv, nzone+2);
7413
7414             /* The rvec buffer is also required for atom buffers of size nsend
7415              * in dd_move_x and dd_move_f.
7416              */
7417             check_vec_rvec_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7418
7419             if (p > 0)
7420             {
7421                 /* We can receive in place if only the last zone is not empty */
7422                 for(zone=0; zone<nzone-1; zone++)
7423                 {
7424                     if (ind->nrecv[zone] > 0)
7425                     {
7426                         cd->bInPlace = FALSE;
7427                     }
7428                 }
7429                 if (!cd->bInPlace)
7430                 {
7431                     /* The int buffer is only required here for the cg indices */
7432                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7433                     {
7434                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7435                         srenew(comm->buf_int2,comm->nalloc_int2);
7436                     }
7437                     /* The rvec buffer is also required for atom buffers
7438                      * of size nrecv in dd_move_x and dd_move_f.
7439                      */
7440                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7441                     check_vec_rvec_alloc(&comm->vbuf2,i);
7442                 }
7443             }
7444
7445             /* Make space for the global cg indices */
7446             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7447                 || dd->cg_nalloc == 0)
7448             {
7449                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7450                 srenew(index_gl,dd->cg_nalloc);
7451                 srenew(cgindex,dd->cg_nalloc+1);
7452             }
7453             /* Communicate the global cg indices */
7454             if (cd->bInPlace)
7455             {
7456                 recv_i = index_gl + pos_cg;
7457             }
7458             else
7459             {
7460                 recv_i = comm->buf_int2;
7461             }
7462             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7463                             comm->buf_int, nsend,
7464                             recv_i,        ind->nrecv[nzone]);
7465
7466             /* Make space for cg_cm */
7467             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7468             {
7469                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7470                 cg_cm = fr->cg_cm;
7471             }
7472             /* Communicate cg_cm */
7473             if (cd->bInPlace)
7474             {
7475                 recv_vr = cg_cm + pos_cg;
7476             }
7477             else
7478             {
7479                 recv_vr = comm->vbuf2.v;
7480             }
7481             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7482                              comm->vbuf.v, nsend,
7483                              recv_vr,      ind->nrecv[nzone]);
7484
7485             /* Make the charge group index */
7486             if (cd->bInPlace)
7487             {
7488                 zone = (p == 0 ? 0 : nzone - 1);
7489                 while (zone < nzone)
7490                 {
7491                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7492                     {
7493                         cg_gl = index_gl[pos_cg];
7494                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7495                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7496                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7497                         if (bBondComm)
7498                         {
7499                             /* Update the charge group presence,
7500                              * so we can use it in the next pass of the loop.
7501                              */
7502                             comm->bLocalCG[cg_gl] = TRUE;
7503                         }
7504                         pos_cg++;
7505                     }
7506                     if (p == 0)
7507                     {
7508                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7509                     }
7510                     zone++;
7511                     zone_cg_range[nzone+zone] = pos_cg;
7512                 }
7513             }
7514             else
7515             {
7516                 /* This part of the code is never executed with bBondComm. */
7517                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7518                                  index_gl,recv_i,cg_cm,recv_vr,
7519                                  cgindex,fr->cginfo_mb,fr->cginfo);
7520                 pos_cg += ind->nrecv[nzone];
7521             }
7522             nat_tot += ind->nrecv[nzone+1];
7523         }
7524         if (!cd->bInPlace)
7525         {
7526             /* Store the atom block for easy copying of communication buffers */
7527             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7528         }
7529         nzone += nzone;
7530     }
7531     dd->index_gl = index_gl;
7532     dd->cgindex  = cgindex;
7533
7534     dd->ncg_tot = zone_cg_range[zones->n];
7535     dd->nat_tot = nat_tot;
7536     comm->nat[ddnatHOME] = dd->nat_home;
7537     for(i=ddnatZONE; i<ddnatNR; i++)
7538     {
7539         comm->nat[i] = dd->nat_tot;
7540     }
7541
7542     if (!bBondComm)
7543     {
7544         /* We don't need to update cginfo, since that was alrady done above.
7545          * So we pass NULL for the forcerec.
7546          */
7547         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7548                       NULL,comm->bLocalCG);
7549     }
7550
7551     if (debug)
7552     {
7553         fprintf(debug,"Finished setting up DD communication, zones:");
7554         for(c=0; c<zones->n; c++)
7555         {
7556             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7557         }
7558         fprintf(debug,"\n");
7559     }
7560 }
7561
7562 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7563 {
7564     int c;
7565
7566     for(c=0; c<zones->nizone; c++)
7567     {
7568         zones->izone[c].cg1  = zones->cg_range[c+1];
7569         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7570         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7571     }
7572 }
7573
7574 static int comp_cgsort(const void *a,const void *b)
7575 {
7576     int comp;
7577
7578     gmx_cgsort_t *cga,*cgb;
7579     cga = (gmx_cgsort_t *)a;
7580     cgb = (gmx_cgsort_t *)b;
7581
7582     comp = cga->nsc - cgb->nsc;
7583     if (comp == 0)
7584     {
7585         comp = cga->ind_gl - cgb->ind_gl;
7586     }
7587
7588     return comp;
7589 }
7590
7591 static void order_int_cg(int n,gmx_cgsort_t *sort,
7592                          int *a,int *buf)
7593 {
7594     int i;
7595
7596     /* Order the data */
7597     for(i=0; i<n; i++)
7598     {
7599         buf[i] = a[sort[i].ind];
7600     }
7601
7602     /* Copy back to the original array */
7603     for(i=0; i<n; i++)
7604     {
7605         a[i] = buf[i];
7606     }
7607 }
7608
7609 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7610                          rvec *v,rvec *buf)
7611 {
7612     int i;
7613
7614     /* Order the data */
7615     for(i=0; i<n; i++)
7616     {
7617         copy_rvec(v[sort[i].ind],buf[i]);
7618     }
7619
7620     /* Copy back to the original array */
7621     for(i=0; i<n; i++)
7622     {
7623         copy_rvec(buf[i],v[i]);
7624     }
7625 }
7626
7627 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7628                            rvec *v,rvec *buf)
7629 {
7630     int a,atot,cg,cg0,cg1,i;
7631
7632     /* Order the data */
7633     a = 0;
7634     for(cg=0; cg<ncg; cg++)
7635     {
7636         cg0 = cgindex[sort[cg].ind];
7637         cg1 = cgindex[sort[cg].ind+1];
7638         for(i=cg0; i<cg1; i++)
7639         {
7640             copy_rvec(v[i],buf[a]);
7641             a++;
7642         }
7643     }
7644     atot = a;
7645
7646     /* Copy back to the original array */
7647     for(a=0; a<atot; a++)
7648     {
7649         copy_rvec(buf[a],v[a]);
7650     }
7651 }
7652
7653 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7654                          int nsort_new,gmx_cgsort_t *sort_new,
7655                          gmx_cgsort_t *sort1)
7656 {
7657     int i1,i2,i_new;
7658
7659     /* The new indices are not very ordered, so we qsort them */
7660     qsort(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7661
7662     /* sort2 is already ordered, so now we can merge the two arrays */
7663     i1 = 0;
7664     i2 = 0;
7665     i_new = 0;
7666     while(i2 < nsort2 || i_new < nsort_new)
7667     {
7668         if (i2 == nsort2)
7669         {
7670             sort1[i1++] = sort_new[i_new++];
7671         }
7672         else if (i_new == nsort_new)
7673         {
7674             sort1[i1++] = sort2[i2++];
7675         }
7676         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7677                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7678                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7679         {
7680             sort1[i1++] = sort2[i2++];
7681         }
7682         else
7683         {
7684             sort1[i1++] = sort_new[i_new++];
7685         }
7686     }
7687 }
7688
7689 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7690                           rvec *cgcm,t_forcerec *fr,t_state *state,
7691                           int ncg_home_old)
7692 {
7693     gmx_domdec_sort_t *sort;
7694     gmx_cgsort_t *cgsort,*sort_i;
7695     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7696     rvec *vbuf;
7697
7698     sort = dd->comm->sort;
7699
7700     if (dd->ncg_home > sort->sort_nalloc)
7701     {
7702         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7703         srenew(sort->sort1,sort->sort_nalloc);
7704         srenew(sort->sort2,sort->sort_nalloc);
7705     }
7706
7707     if (ncg_home_old >= 0)
7708     {
7709         /* The charge groups that remained in the same ns grid cell
7710          * are completely ordered. So we can sort efficiently by sorting
7711          * the charge groups that did move into the stationary list.
7712          */
7713         ncg_new = 0;
7714         nsort2 = 0;
7715         nsort_new = 0;
7716         for(i=0; i<dd->ncg_home; i++)
7717         {
7718             /* Check if this cg did not move to another node */
7719             cell_index = fr->ns.grid->cell_index[i];
7720             if (cell_index !=  4*fr->ns.grid->ncells)
7721             {
7722                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7723                 {
7724                     /* This cg is new on this node or moved ns grid cell */
7725                     if (nsort_new >= sort->sort_new_nalloc)
7726                     {
7727                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7728                         srenew(sort->sort_new,sort->sort_new_nalloc);
7729                     }
7730                     sort_i = &(sort->sort_new[nsort_new++]);
7731                 }
7732                 else
7733                 {
7734                     /* This cg did not move */
7735                     sort_i = &(sort->sort2[nsort2++]);
7736                 }
7737                 /* Sort on the ns grid cell indices
7738                  * and the global topology index
7739                  */
7740                 sort_i->nsc    = cell_index;
7741                 sort_i->ind_gl = dd->index_gl[i];
7742                 sort_i->ind    = i;
7743                 ncg_new++;
7744             }
7745         }
7746         if (debug)
7747         {
7748             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7749                     nsort2,nsort_new);
7750         }
7751         /* Sort efficiently */
7752         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7753     }
7754     else
7755     {
7756         cgsort = sort->sort1;
7757         ncg_new = 0;
7758         for(i=0; i<dd->ncg_home; i++)
7759         {
7760             /* Sort on the ns grid cell indices
7761              * and the global topology index
7762              */
7763             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7764             cgsort[i].ind_gl = dd->index_gl[i];
7765             cgsort[i].ind    = i;
7766             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7767             {
7768                 ncg_new++;
7769             }
7770         }
7771         if (debug)
7772         {
7773             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7774         }
7775         /* Determine the order of the charge groups using qsort */
7776         qsort(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7777     }
7778     cgsort = sort->sort1;
7779
7780     /* We alloc with the old size, since cgindex is still old */
7781     check_vec_rvec_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7782     vbuf = dd->comm->vbuf.v;
7783
7784     /* Remove the charge groups which are no longer at home here */
7785     dd->ncg_home = ncg_new;
7786
7787     /* Reorder the state */
7788     for(i=estX; i<estNR; i++)
7789     {
7790         if (state->flags & (1<<i))
7791         {
7792             switch (i)
7793             {
7794             case estX:
7795                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
7796                 break;
7797             case estV:
7798                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
7799                 break;
7800             case estSDX:
7801                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
7802                 break;
7803             case estCGP:
7804                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
7805                 break;
7806             case estLD_RNG:
7807             case estLD_RNGI:
7808             case estDISRE_INITF:
7809             case estDISRE_RM3TAV:
7810             case estORIRE_INITF:
7811             case estORIRE_DTAV:
7812                 /* No ordering required */
7813                 break;
7814             default:
7815                 gmx_incons("Unknown state entry encountered in dd_sort_state");
7816                 break;
7817             }
7818         }
7819     }
7820     /* Reorder cgcm */
7821     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
7822
7823     if (dd->ncg_home+1 > sort->ibuf_nalloc)
7824     {
7825         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
7826         srenew(sort->ibuf,sort->ibuf_nalloc);
7827     }
7828     ibuf = sort->ibuf;
7829     /* Reorder the global cg index */
7830     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
7831     /* Reorder the cginfo */
7832     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
7833     /* Rebuild the local cg index */
7834     ibuf[0] = 0;
7835     for(i=0; i<dd->ncg_home; i++)
7836     {
7837         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
7838         ibuf[i+1] = ibuf[i] + cgsize;
7839     }
7840     for(i=0; i<dd->ncg_home+1; i++)
7841     {
7842         dd->cgindex[i] = ibuf[i];
7843     }
7844     /* Set the home atom number */
7845     dd->nat_home = dd->cgindex[dd->ncg_home];
7846
7847     /* Copy the sorted ns cell indices back to the ns grid struct */
7848     for(i=0; i<dd->ncg_home; i++)
7849     {
7850         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
7851     }
7852     fr->ns.grid->nr = dd->ncg_home;
7853 }
7854
7855 static void add_dd_statistics(gmx_domdec_t *dd)
7856 {
7857     gmx_domdec_comm_t *comm;
7858     int ddnat;
7859
7860     comm = dd->comm;
7861
7862     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
7863     {
7864         comm->sum_nat[ddnat-ddnatZONE] +=
7865             comm->nat[ddnat] - comm->nat[ddnat-1];
7866     }
7867     comm->ndecomp++;
7868 }
7869
7870 void reset_dd_statistics_counters(gmx_domdec_t *dd)
7871 {
7872     gmx_domdec_comm_t *comm;
7873     int ddnat;
7874
7875     comm = dd->comm;
7876
7877     /* Reset all the statistics and counters for total run counting */
7878     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
7879     {
7880         comm->sum_nat[ddnat-ddnatZONE] = 0;
7881     }
7882     comm->ndecomp = 0;
7883     comm->nload = 0;
7884     comm->load_step = 0;
7885     comm->load_sum = 0;
7886     comm->load_max = 0;
7887     clear_ivec(comm->load_lim);
7888     comm->load_mdf = 0;
7889     comm->load_pme = 0;
7890 }
7891
7892 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
7893 {
7894     gmx_domdec_comm_t *comm;
7895     int ddnat;
7896     double av;
7897
7898     comm = cr->dd->comm;
7899
7900     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
7901
7902     if (fplog == NULL)
7903     {
7904         return;
7905     }
7906
7907     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
7908
7909     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
7910     {
7911         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
7912         switch(ddnat)
7913         {
7914         case ddnatZONE:
7915             fprintf(fplog,
7916                     " av. #atoms communicated per step for force:  %d x %.1f\n",
7917                     2,av);
7918             break;
7919         case ddnatVSITE:
7920             if (cr->dd->vsite_comm)
7921             {
7922                 fprintf(fplog,
7923                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
7924                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
7925                         av);
7926             }
7927             break;
7928         case ddnatCON:
7929             if (cr->dd->constraint_comm)
7930             {
7931                 fprintf(fplog,
7932                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
7933                         1 + ir->nLincsIter,av);
7934             }
7935             break;
7936         default:
7937             gmx_incons(" Unknown type for DD statistics");
7938         }
7939     }
7940     fprintf(fplog,"\n");
7941
7942     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
7943     {
7944         print_dd_load_av(fplog,cr->dd);
7945     }
7946 }
7947
7948 void dd_partition_system(FILE            *fplog,
7949                          gmx_step_t      step,
7950                          t_commrec       *cr,
7951                          bool            bMasterState,
7952                          int             nstglobalcomm,
7953                          t_state         *state_global,
7954                          gmx_mtop_t      *top_global,
7955                          t_inputrec      *ir,
7956                          t_state         *state_local,
7957                          rvec            **f,
7958                          t_mdatoms       *mdatoms,
7959                          gmx_localtop_t  *top_local,
7960                          t_forcerec      *fr,
7961                          gmx_vsite_t     *vsite,
7962                          gmx_shellfc_t   shellfc,
7963                          gmx_constr_t    constr,
7964                          t_nrnb          *nrnb,
7965                          gmx_wallcycle_t wcycle,
7966                          bool            bVerbose)
7967 {
7968     gmx_domdec_t *dd;
7969     gmx_domdec_comm_t *comm;
7970     gmx_ddbox_t ddbox;
7971     t_block *cgs_gl;
7972     gmx_step_t step_pcoupl;
7973     rvec cell_ns_x0,cell_ns_x1;
7974     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
7975     bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
7976     bool bRedist,bSortCG,bResortAll;
7977     ivec ncells_old,np;
7978     real grid_density;
7979     char sbuf[22];
7980
7981     dd = cr->dd;
7982     comm = dd->comm;
7983
7984     bBoxChanged = (bMasterState || DEFORM(*ir));
7985     if (ir->epc != epcNO)
7986     {
7987         /* With nstcalcenery > 1 pressure coupling happens.
7988          * one step after calculating the energies.
7989          * Box scaling happens at the end of the MD step,
7990          * after the DD partitioning.
7991          * We therefore have to do DLB in the first partitioning
7992          * after an MD step where P-coupling occured.
7993          * We need to determine the last step in which p-coupling occurred.
7994          */
7995         n = ir->nstcalcenergy;
7996         if (n == 1)
7997         {
7998             step_pcoupl = step - 1;
7999         }
8000         else
8001         {
8002             step_pcoupl = ((step - 1)/n)*n + 1;
8003         }
8004         if (step_pcoupl >= comm->partition_step)
8005         {
8006             bBoxChanged = TRUE;
8007         }
8008     }
8009
8010     bNStGlobalComm = (step >= comm->partition_step + nstglobalcomm);
8011
8012     if (!comm->bDynLoadBal)
8013     {
8014         bDoDLB = FALSE;
8015     }
8016     else
8017     {
8018         /* Should we do dynamic load balacing this step?
8019          * Since it requires (possibly expensive) global communication,
8020          * we might want to do DLB less frequently.
8021          */
8022         if (bBoxChanged || ir->epc != epcNO)
8023         {
8024             bDoDLB = bBoxChanged;
8025         }
8026         else
8027         {
8028             bDoDLB = bNStGlobalComm;
8029         }
8030     }
8031
8032     /* Check if we have recorded loads on the nodes */
8033     if (comm->bRecordLoad && dd_load_count(comm))
8034     {
8035         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8036         {
8037             /* Check if we should use DLB at the second partitioning
8038              * and every 100 partitionings,
8039              * so the extra communication cost is negligible.
8040              */
8041             n = max(100,nstglobalcomm);
8042             bCheckDLB = (comm->n_load_collect == 0 ||
8043                          comm->n_load_have % n == n-1);
8044         }
8045         else
8046         {
8047             bCheckDLB = FALSE;
8048         }
8049
8050         /* Print load every nstlog, first and last step to the log file */
8051         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8052                     comm->n_load_collect == 0 ||
8053                     (step + ir->nstlist > ir->init_step + ir->nsteps));
8054
8055         /* Avoid extra communication due to verbose screen output
8056          * when nstglobalcomm is set.
8057          */
8058         if (bDoDLB || bLogLoad || bCheckDLB ||
8059             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8060         {
8061             get_load_distribution(dd,wcycle);
8062             if (DDMASTER(dd))
8063             {
8064                 if (bLogLoad)
8065                 {
8066                     dd_print_load(fplog,dd,step-1);
8067                 }
8068                 if (bVerbose)
8069                 {
8070                     dd_print_load_verbose(dd);
8071                 }
8072             }
8073             comm->n_load_collect++;
8074
8075             if (bCheckDLB) {
8076                 /* Since the timings are node dependent, the master decides */
8077                 if (DDMASTER(dd))
8078                 {
8079                     bTurnOnDLB =
8080                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8081                     if (debug)
8082                     {
8083                         fprintf(debug,"step %s, imb loss %f\n",
8084                                 gmx_step_str(step,sbuf),
8085                                 dd_force_imb_perf_loss(dd));
8086                     }
8087                 }
8088                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8089                 if (bTurnOnDLB)
8090                 {
8091                     turn_on_dlb(fplog,cr,step);
8092                     bDoDLB = TRUE;
8093                 }
8094             }
8095         }
8096         comm->n_load_have++;
8097     }
8098
8099     cgs_gl = &comm->cgs_gl;
8100
8101     bRedist = FALSE;
8102     if (bMasterState)
8103     {
8104         /* Clear the old state */
8105         clear_dd_indices(dd,0,0);
8106
8107         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8108                   TRUE,cgs_gl,state_global->x,&ddbox);
8109
8110         get_cg_distribution(fplog,step,dd,cgs_gl,
8111                             state_global->box,&ddbox,state_global->x);
8112
8113         dd_distribute_state(dd,cgs_gl,
8114                             state_global,state_local,f);
8115
8116         dd_make_local_cgs(dd,&top_local->cgs);
8117
8118         if (dd->ncg_home > fr->cg_nalloc)
8119         {
8120             dd_realloc_fr_cg(fr,dd->ncg_home);
8121         }
8122         calc_cgcm(fplog,0,dd->ncg_home,
8123                   &top_local->cgs,state_local->x,fr->cg_cm);
8124
8125         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8126
8127         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8128
8129         cg0 = 0;
8130     }
8131     else if (state_local->ddp_count != dd->ddp_count)
8132     {
8133         if (state_local->ddp_count > dd->ddp_count)
8134         {
8135             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8136         }
8137
8138         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8139         {
8140             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8141         }
8142
8143         /* Clear the old state */
8144         clear_dd_indices(dd,0,0);
8145
8146         /* Build the new indices */
8147         rebuild_cgindex(dd,cgs_gl->index,state_local);
8148         make_dd_indices(dd,cgs_gl->index,0);
8149
8150         /* Redetermine the cg COMs */
8151         calc_cgcm(fplog,0,dd->ncg_home,
8152                   &top_local->cgs,state_local->x,fr->cg_cm);
8153
8154         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8155
8156         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8157
8158         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8159                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8160
8161         bRedist = comm->bDynLoadBal;
8162     }
8163     else
8164     {
8165         /* We have the full state, only redistribute the cgs */
8166
8167         /* Clear the non-home indices */
8168         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8169
8170         /* Avoid global communication for dim's without pbc and -gcom */
8171         if (!bNStGlobalComm)
8172         {
8173             copy_rvec(comm->box0    ,ddbox.box0    );
8174             copy_rvec(comm->box_size,ddbox.box_size);
8175         }
8176         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8177                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8178
8179         bBoxChanged = TRUE;
8180         bRedist = TRUE;
8181     }
8182     /* For dim's without pbc and -gcom */
8183     copy_rvec(ddbox.box0    ,comm->box0    );
8184     copy_rvec(ddbox.box_size,comm->box_size);
8185
8186     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8187                       step,wcycle);
8188
8189     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8190     {
8191         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8192     }
8193
8194     /* Check if we should sort the charge groups */
8195     if (comm->nstSortCG > 0)
8196     {
8197         bSortCG = (bMasterState ||
8198                    (bRedist && (step % comm->nstSortCG == 0)));
8199     }
8200     else
8201     {
8202         bSortCG = FALSE;
8203     }
8204
8205     ncg_home_old = dd->ncg_home;
8206
8207     if (bRedist)
8208     {
8209         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8210                                  state_local,f,fr,mdatoms,
8211                                  !bSortCG,nrnb);
8212     }
8213
8214     get_nsgrid_boundaries(fr->ns.grid,dd,
8215                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8216                           dd->ncg_home,fr->cg_cm,
8217                           cell_ns_x0,cell_ns_x1,&grid_density);
8218
8219     if (bBoxChanged)
8220     {
8221         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8222     }
8223
8224     copy_ivec(fr->ns.grid->n,ncells_old);
8225     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8226                state_local->box,cell_ns_x0,cell_ns_x1,
8227                fr->rlistlong,grid_density);
8228     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8229     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8230
8231     if (bSortCG)
8232     {
8233         /* Sort the state on charge group position.
8234          * This enables exact restarts from this step.
8235          * It also improves performance by about 15% with larger numbers
8236          * of atoms per node.
8237          */
8238
8239         /* Fill the ns grid with the home cell,
8240          * so we can sort with the indices.
8241          */
8242         set_zones_ncg_home(dd);
8243         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8244                   0,dd->ncg_home,fr->cg_cm);
8245
8246         /* Check if we can user the old order and ns grid cell indices
8247          * of the charge groups to sort the charge groups efficiently.
8248          */
8249         bResortAll = (bMasterState ||
8250                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8251                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8252                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8253
8254         if (debug)
8255         {
8256             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8257                     gmx_step_str(step,sbuf),dd->ncg_home);
8258         }
8259         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8260                       bResortAll ? -1 : ncg_home_old);
8261         /* Rebuild all the indices */
8262         cg0 = 0;
8263         ga2la_clear(dd->ga2la);
8264     }
8265
8266     /* Setup up the communication and communicate the coordinates */
8267     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8268
8269     /* Set the indices */
8270     make_dd_indices(dd,cgs_gl->index,cg0);
8271
8272     /* Set the charge group boundaries for neighbor searching */
8273     set_cg_boundaries(&comm->zones);
8274
8275     /*
8276     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8277                  -1,state_local->x,state_local->box);
8278     */
8279
8280     /* Extract a local topology from the global topology */
8281     for(i=0; i<dd->ndim; i++)
8282     {
8283         np[dd->dim[i]] = comm->cd[i].np;
8284     }
8285     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8286                       comm->cellsize_min,np,
8287                       fr,vsite,top_global,top_local);
8288
8289     /* Set up the special atom communication */
8290     n = comm->nat[ddnatZONE];
8291     for(i=ddnatZONE+1; i<ddnatNR; i++)
8292     {
8293         switch(i)
8294         {
8295         case ddnatVSITE:
8296             if (vsite && vsite->n_intercg_vsite)
8297             {
8298                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8299             }
8300             break;
8301         case ddnatCON:
8302             if (dd->bInterCGcons)
8303             {
8304                 /* Only for inter-cg constraints we need special code */
8305                 n = dd_make_local_constraints(dd,n,top_global,
8306                                               constr,ir->nProjOrder,
8307                                               &top_local->idef.il[F_CONSTR]);
8308             }
8309             break;
8310         default:
8311             gmx_incons("Unknown special atom type setup");
8312         }
8313         comm->nat[i] = n;
8314     }
8315
8316     /* Make space for the extra coordinates for virtual site
8317      * or constraint communication.
8318      */
8319     state_local->natoms = comm->nat[ddnatNR-1];
8320     if (state_local->natoms > state_local->nalloc)
8321     {
8322         dd_realloc_state(state_local,f,state_local->natoms);
8323     }
8324
8325     if (fr->bF_NoVirSum)
8326     {
8327         if (vsite && vsite->n_intercg_vsite)
8328         {
8329             nat_f_novirsum = comm->nat[ddnatVSITE];
8330         }
8331         else
8332         {
8333             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8334             {
8335                 nat_f_novirsum = dd->nat_tot;
8336             }
8337             else
8338             {
8339                 nat_f_novirsum = dd->nat_home;
8340             }
8341         }
8342     }
8343     else
8344     {
8345         nat_f_novirsum = 0;
8346     }
8347
8348     /* Set the number of atoms required for the force calculation */
8349     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,dd->nat_tot,nat_f_novirsum);
8350
8351     /* We make the all mdatoms up to nat_tot_con.
8352      * We could save some work by only setting invmass
8353      * between nat_tot and nat_tot_con.
8354      */
8355     /* This call also sets the new number of home particles to dd->nat_home */
8356     atoms2md(top_global,ir,
8357              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8358
8359     if (shellfc)
8360     {
8361         /* Make the local shell stuff, currently no communication is done */
8362         make_local_shells(cr,mdatoms,shellfc);
8363     }
8364
8365         if (ir->implicit_solvent)
8366     {
8367         make_local_gb(cr,fr->born,ir->gb_algorithm);
8368     }
8369
8370     if (!(cr->duty & DUTY_PME))
8371     {
8372         /* Send the charges to our PME only node */
8373         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8374                        mdatoms->chargeA,mdatoms->chargeB,
8375                        comm->ddpme[0].maxshift,comm->ddpme[1].maxshift);
8376     }
8377
8378     if (constr)
8379     {
8380         set_constraints(constr,top_local,ir,mdatoms,cr);
8381     }
8382
8383     if (ir->ePull != epullNO)
8384     {
8385         /* Update the local pull groups */
8386         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8387     }
8388
8389     if (ir->bRot)
8390     {
8391         /* Update the local rotation groups */
8392         dd_make_local_rotation_groups(dd,ir->rot,mdatoms);
8393     }
8394
8395
8396     add_dd_statistics(dd);
8397
8398     /* Make sure we only count the cycles for this DD partitioning */
8399     clear_dd_cycle_counts(dd);
8400
8401     /* Because the order of the atoms might have changed since
8402      * the last vsite construction, we need to communicate the constructing
8403      * atom coordinates again (for spreading the forces this MD step).
8404      */
8405     dd_move_x_vsites(dd,state_local->box,state_local->x);
8406
8407     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8408     {
8409         dd_move_x(dd,state_local->box,state_local->x);
8410         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8411                      -1,state_local->x,state_local->box);
8412     }
8413
8414     /* Store the partitioning step */
8415     comm->partition_step = step;
8416
8417     /* Increase the DD partitioning counter */
8418     dd->ddp_count++;
8419     /* The state currently matches this DD partitioning count, store it */
8420     state_local->ddp_count = dd->ddp_count;
8421     if (bMasterState)
8422     {
8423         /* The DD master node knows the complete cg distribution,
8424          * store the count so we can possibly skip the cg info communication.
8425          */
8426         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8427     }
8428
8429     if (comm->DD_debug > 0)
8430     {
8431         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8432         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8433                                 "after partitioning");
8434     }
8435 }