src/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "vec.h"
  31 #include "domdec.h"
  32 #include "domdec_network.h"
  33 #include "nrnb.h"
  34 #include "pbc.h"
  35 #include "chargegroup.h"
  36 #include "constr.h"
  37 #include "mdatoms.h"
  38 #include "names.h"
  39 #include "pdbio.h"
  40 #include "futil.h"
  41 #include "force.h"
  42 #include "pme.h"
  43 #include "pull.h"
  44 #include "pull_rotation.h"
  45 #include "gmx_wallcycle.h"
  46 #include "mdrun.h"
  47 #include "nsgrid.h"
  48 #include "shellfc.h"
  49 #include "mtop_util.h"
  50 #include "gmxfio.h"
  51 #include "gmx_ga2la.h"
  52 #include "gmx_sort.h"
  53
  54 #ifdef GMX_LIB_MPI
  55 #include <mpi.h>
  56 #endif
  57 #ifdef GMX_THREADS
  58 #include "tmpi.h"
  59 #endif
  60
  61 #define DDRANK(dd,rank)    (rank)
  62 #define DDMASTERRANK(dd)   (dd->masterrank)
  63
  64 typedef struct gmx_domdec_master
  65 {
  66     /* The cell boundaries */
  67     real **cell_x;
  68     /* The global charge group division */
  69     int  *ncg;     /* Number of home charge groups for each node */
  70     int  *index;   /* Index of nnodes+1 into cg */
  71     int  *cg;      /* Global charge group index */
  72     int  *nat;     /* Number of home atoms for each node. */
  73     int  *ibuf;    /* Buffer for communication */
  74     rvec *vbuf;    /* Buffer for state scattering and gathering */
  75 } gmx_domdec_master_t;
  76
  77 typedef struct
  78 {
  79     /* The numbers of charge groups to send and receive for each cell
  80      * that requires communication, the last entry contains the total
  81      * number of atoms that needs to be communicated.
  82      */
  83     int nsend[DD_MAXIZONE+2];
  84     int nrecv[DD_MAXIZONE+2];
  85     /* The charge groups to send */
  86     int *index;
  87     int nalloc;
  88     /* The atom range for non-in-place communication */
  89     int cell2at0[DD_MAXIZONE];
  90     int cell2at1[DD_MAXIZONE];
  91 } gmx_domdec_ind_t;
  92
  93 typedef struct
  94 {
  95     int  np;                   /* Number of grid pulses in this dimension */
  96     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
  97     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
  98     int  np_nalloc;
  99     gmx_bool bInPlace;             /* Can we communicate in place?            */
 100 } gmx_domdec_comm_dim_t;
 101
 102 typedef struct
 103 {
 104     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 105     real *cell_f;      /* State var.: cell boundaries, box relative      */
 106     real *old_cell_f;  /* Temp. var.: old cell size                      */
 107     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 108     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 109     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 110     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 111     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 112     real *buf_ncd;     /* Temp. var.                                     */
 113 } gmx_domdec_root_t;
 114
 115 #define DD_NLOAD_MAX 9
 116
 117 /* Here floats are accurate enough, since these variables
 118  * only influence the load balancing, not the actual MD results.
 119  */
 120 typedef struct
 121 {
 122     int  nload;
 123     float *load;
 124     float sum;
 125     float max;
 126     float sum_m;
 127     float cvol_min;
 128     float mdf;
 129     float pme;
 130     int   flags;
 131 } gmx_domdec_load_t;
 132
 133 typedef struct
 134 {
 135     int  nsc;
 136     int  ind_gl;
 137     int  ind;
 138 } gmx_cgsort_t;
 139
 140 typedef struct
 141 {
 142     gmx_cgsort_t *sort1,*sort2;
 143     int  sort_nalloc;
 144     gmx_cgsort_t *sort_new;
 145     int  sort_new_nalloc;
 146     int  *ibuf;
 147     int  ibuf_nalloc;
 148 } gmx_domdec_sort_t;
 149
 150 typedef struct
 151 {
 152     rvec *v;
 153     int  nalloc;
 154 } vec_rvec_t;
 155
 156 /* This enum determines the order of the coordinates.
 157  * ddnatHOME and ddnatZONE should be first and second,
 158  * the others can be ordered as wanted.
 159  */
 160 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 161
 162 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 163 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 164
 165 typedef struct
 166 {
 167     int  dim;      /* The dimension                                          */
 168     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 169     int  nslab;    /* The number of PME slabs in this dimension              */
 170     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 171     int  *pp_min;  /* The minimum pp node location, size nslab               */
 172     int  *pp_max;  /* The maximum pp node location,size nslab                */
 173     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 174 } gmx_ddpme_t;
 175
 176 typedef struct
 177 {
 178     real min0;    /* The minimum bottom of this zone                        */
 179     real max1;    /* The maximum top of this zone                           */
 180     real mch0;    /* The maximum bottom communicaton height for this zone   */
 181     real mch1;    /* The maximum top communicaton height for this zone      */
 182     real p1_0;    /* The bottom value of the first cell in this zone        */
 183     real p1_1;    /* The top value of the first cell in this zone           */
 184 } gmx_ddzone_t;
 185
 186 typedef struct gmx_domdec_comm
 187 {
 188     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 189      * unless stated otherwise.
 190      */
 191
 192     /* The number of decomposition dimensions for PME, 0: no PME */
 193     int  npmedecompdim;
 194     /* The number of nodes doing PME (PP/PME or only PME) */
 195     int  npmenodes;
 196     int  npmenodes_x;
 197     int  npmenodes_y;
 198     /* The communication setup including the PME only nodes */
 199     gmx_bool bCartesianPP_PME;
 200     ivec ntot;
 201     int  cartpmedim;
 202     int  *pmenodes;          /* size npmenodes                         */
 203     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 204                               * but with bCartesianPP_PME              */
 205     gmx_ddpme_t ddpme[2];
 206
 207     /* The DD particle-particle nodes only */
 208     gmx_bool bCartesianPP;
 209     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 210
 211     /* The global charge groups */
 212     t_block cgs_gl;
 213
 214     /* Should we sort the cgs */
 215     int  nstSortCG;
 216     gmx_domdec_sort_t *sort;
 217
 218     /* Are there bonded and multi-body interactions between charge groups? */
 219     gmx_bool bInterCGBondeds;
 220     gmx_bool bInterCGMultiBody;
 221
 222     /* Data for the optional bonded interaction atom communication range */
 223     gmx_bool bBondComm;
 224     t_blocka *cglink;
 225     char *bLocalCG;
 226
 227     /* The DLB option */
 228     int  eDLB;
 229     /* Are we actually using DLB? */
 230     gmx_bool bDynLoadBal;
 231
 232     /* Cell sizes for static load balancing, first index cartesian */
 233     real **slb_frac;
 234
 235     /* The width of the communicated boundaries */
 236     real cutoff_mbody;
 237     real cutoff;
 238     /* The minimum cell size (including triclinic correction) */
 239     rvec cellsize_min;
 240     /* For dlb, for use with edlbAUTO */
 241     rvec cellsize_min_dlb;
 242     /* The lower limit for the DD cell size with DLB */
 243     real cellsize_limit;
 244     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 245     gmx_bool bVacDLBNoLimit;
 246
 247     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 248     ivec tric_dir;
 249     /* box0 and box_size are required with dim's without pbc and -gcom */
 250     rvec box0;
 251     rvec box_size;
 252
 253     /* The cell boundaries */
 254     rvec cell_x0;
 255     rvec cell_x1;
 256
 257     /* The old location of the cell boundaries, to check cg displacements */
 258     rvec old_cell_x0;
 259     rvec old_cell_x1;
 260
 261     /* The communication setup and charge group boundaries for the zones */
 262     gmx_domdec_zones_t zones;
 263
 264     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 265      * cell boundaries of neighboring cells for dynamic load balancing.
 266      */
 267     gmx_ddzone_t zone_d1[2];
 268     gmx_ddzone_t zone_d2[2][2];
 269
 270     /* The coordinate/force communication setup and indices */
 271     gmx_domdec_comm_dim_t cd[DIM];
 272     /* The maximum number of cells to communicate with in one dimension */
 273     int  maxpulse;
 274
 275     /* Which cg distribution is stored on the master node */
 276     int master_cg_ddp_count;
 277
 278     /* The number of cg's received from the direct neighbors */
 279     int  zone_ncg1[DD_MAXZONE];
 280
 281     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 282     int  nat[ddnatNR];
 283
 284     /* Communication buffer for general use */
 285     int  *buf_int;
 286     int  nalloc_int;
 287
 288      /* Communication buffer for general use */
 289     vec_rvec_t vbuf;
 290
 291     /* Communication buffers only used with multiple grid pulses */
 292     int  *buf_int2;
 293     int  nalloc_int2;
 294     vec_rvec_t vbuf2;
 295
 296     /* Communication buffers for local redistribution */
 297     int  **cggl_flag;
 298     int  cggl_flag_nalloc[DIM*2];
 299     rvec **cgcm_state;
 300     int  cgcm_state_nalloc[DIM*2];
 301
 302     /* Cell sizes for dynamic load balancing */
 303     gmx_domdec_root_t **root;
 304     real *cell_f_row;
 305     real cell_f0[DIM];
 306     real cell_f1[DIM];
 307     real cell_f_max0[DIM];
 308     real cell_f_min1[DIM];
 309
 310     /* Stuff for load communication */
 311     gmx_bool bRecordLoad;
 312     gmx_domdec_load_t *load;
 313 #ifdef GMX_MPI
 314     MPI_Comm *mpi_comm_load;
 315 #endif
 316     /* Cycle counters */
 317     float cycl[ddCyclNr];
 318     int   cycl_n[ddCyclNr];
 319     float cycl_max[ddCyclNr];
 320     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 321     int eFlop;
 322     double flop;
 323     int    flop_n;
 324     /* Have often have did we have load measurements */
 325     int    n_load_have;
 326     /* Have often have we collected the load measurements */
 327     int    n_load_collect;
 328
 329     /* Statistics */
 330     double sum_nat[ddnatNR-ddnatZONE];
 331     int    ndecomp;
 332     int    nload;
 333     double load_step;
 334     double load_sum;
 335     double load_max;
 336     ivec   load_lim;
 337     double load_mdf;
 338     double load_pme;
 339
 340     /* The last partition step */
 341     gmx_large_int_t partition_step;
 342
 343     /* Debugging */
 344     int  nstDDDump;
 345     int  nstDDDumpGrid;
 346     int  DD_debug;
 347 } gmx_domdec_comm_t;
 348
 349 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 350 #define DD_CGIBS 2
 351
 352 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 353 #define DD_FLAG_NRCG  65535
 354 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 355 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 356
 357 /* Zone permutation required to obtain consecutive charge groups
 358  * for neighbor searching.
 359  */
 360 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 361
 362 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 363  * components see only j zones with that component 0.
 364  */
 365
 366 /* The DD zone order */
 367 static const ivec dd_zo[DD_MAXZONE] =
 368   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 369
 370 /* The 3D setup */
 371 #define dd_z3n  8
 372 #define dd_zp3n 4
 373 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 374
 375 /* The 2D setup */
 376 #define dd_z2n  4
 377 #define dd_zp2n 2
 378 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 379
 380 /* The 1D setup */
 381 #define dd_z1n  2
 382 #define dd_zp1n 1
 383 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 384
 385 /* Factors used to avoid problems due to rounding issues */
 386 #define DD_CELL_MARGIN       1.0001
 387 #define DD_CELL_MARGIN2      1.00005
 388 /* Factor to account for pressure scaling during nstlist steps */
 389 #define DD_PRES_SCALE_MARGIN 1.02
 390
 391 /* Allowed performance loss before we DLB or warn */
 392 #define DD_PERF_LOSS 0.05
 393
 394 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 395
 396 /* Use separate MPI send and receive commands
 397  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 398  * This saves memory (and some copying for small nnodes).
 399  * For high parallelization scatter and gather calls are used.
 400  */
 401 #define GMX_DD_NNODES_SENDRECV 4
 402
 403
 404 /*
 405 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 406
 407 static void index2xyz(ivec nc,int ind,ivec xyz)
 408 {
 409   xyz[XX] = ind % nc[XX];
 410   xyz[YY] = (ind / nc[XX]) % nc[YY];
 411   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 412 }
 413 */
 414
 415 /* This order is required to minimize the coordinate communication in PME
 416  * which uses decomposition in the x direction.
 417  */
 418 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 419
 420 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 421 {
 422     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 423     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 424     xyz[ZZ] = ind % nc[ZZ];
 425 }
 426
 427 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 428 {
 429     int ddindex;
 430     int ddnodeid=-1;
 431
 432     ddindex = dd_index(dd->nc,c);
 433     if (dd->comm->bCartesianPP_PME)
 434     {
 435         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 436     }
 437     else if (dd->comm->bCartesianPP)
 438     {
 439 #ifdef GMX_MPI
 440         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 441 #endif
 442     }
 443     else
 444     {
 445         ddnodeid = ddindex;
 446     }
 447
 448     return ddnodeid;
 449 }
 450
 451 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 452 {
 453     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 454 }
 455
 456 int ddglatnr(gmx_domdec_t *dd,int i)
 457 {
 458     int atnr;
 459
 460     if (dd == NULL)
 461     {
 462         atnr = i + 1;
 463     }
 464     else
 465     {
 466         if (i >= dd->comm->nat[ddnatNR-1])
 467         {
 468             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 469         }
 470         atnr = dd->gatindex[i] + 1;
 471     }
 472
 473     return atnr;
 474 }
 475
 476 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 477 {
 478     return &dd->comm->cgs_gl;
 479 }
 480
 481 static void vec_rvec_init(vec_rvec_t *v)
 482 {
 483     v->nalloc = 0;
 484     v->v      = NULL;
 485 }
 486
 487 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 488 {
 489     if (n > v->nalloc)
 490     {
 491         v->nalloc = over_alloc_dd(n);
 492         srenew(v->v,v->nalloc);
 493     }
 494 }
 495
 496 void dd_store_state(gmx_domdec_t *dd,t_state *state)
 497 {
 498     int i;
 499
 500     if (state->ddp_count != dd->ddp_count)
 501     {
 502         gmx_incons("The state does not the domain decomposition state");
 503     }
 504
 505     state->ncg_gl = dd->ncg_home;
 506     if (state->ncg_gl > state->cg_gl_nalloc)
 507     {
 508         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 509         srenew(state->cg_gl,state->cg_gl_nalloc);
 510     }
 511     for(i=0; i<state->ncg_gl; i++)
 512     {
 513         state->cg_gl[i] = dd->index_gl[i];
 514     }
 515
 516     state->ddp_count_cg_gl = dd->ddp_count;
 517 }
 518
 519 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 520 {
 521     return &dd->comm->zones;
 522 }
 523
 524 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 525                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 526 {
 527     gmx_domdec_zones_t *zones;
 528     int izone,d,dim;
 529
 530     zones = &dd->comm->zones;
 531
 532     izone = 0;
 533     while (icg >= zones->izone[izone].cg1)
 534     {
 535         izone++;
 536     }
 537
 538     if (izone == 0)
 539     {
 540         *jcg0 = icg;
 541     }
 542     else if (izone < zones->nizone)
 543     {
 544         *jcg0 = zones->izone[izone].jcg0;
 545     }
 546     else
 547     {
 548         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 549                   icg,izone,zones->nizone);
 550     }
 551
 552     *jcg1 = zones->izone[izone].jcg1;
 553
 554     for(d=0; d<dd->ndim; d++)
 555     {
 556         dim = dd->dim[d];
 557         shift0[dim] = zones->izone[izone].shift0[dim];
 558         shift1[dim] = zones->izone[izone].shift1[dim];
 559         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 560         {
 561             /* A conservative approach, this can be optimized */
 562             shift0[dim] -= 1;
 563             shift1[dim] += 1;
 564         }
 565     }
 566 }
 567
 568 int dd_natoms_vsite(gmx_domdec_t *dd)
 569 {
 570     return dd->comm->nat[ddnatVSITE];
 571 }
 572
 573 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 574 {
 575     *at_start = dd->comm->nat[ddnatCON-1];
 576     *at_end   = dd->comm->nat[ddnatCON];
 577 }
 578
 579 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 580 {
 581     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 582     int  *index,*cgindex;
 583     gmx_domdec_comm_t *comm;
 584     gmx_domdec_comm_dim_t *cd;
 585     gmx_domdec_ind_t *ind;
 586     rvec shift={0,0,0},*buf,*rbuf;
 587     gmx_bool bPBC,bScrew;
 588
 589     comm = dd->comm;
 590
 591     cgindex = dd->cgindex;
 592
 593     buf = comm->vbuf.v;
 594
 595     nzone = 1;
 596     nat_tot = dd->nat_home;
 597     for(d=0; d<dd->ndim; d++)
 598     {
 599         bPBC   = (dd->ci[dd->dim[d]] == 0);
 600         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 601         if (bPBC)
 602         {
 603             copy_rvec(box[dd->dim[d]],shift);
 604         }
 605         cd = &comm->cd[d];
 606         for(p=0; p<cd->np; p++)
 607         {
 608             ind = &cd->ind[p];
 609             index = ind->index;
 610             n = 0;
 611             if (!bPBC)
 612             {
 613                 for(i=0; i<ind->nsend[nzone]; i++)
 614                 {
 615                     at0 = cgindex[index[i]];
 616                     at1 = cgindex[index[i]+1];
 617                     for(j=at0; j<at1; j++)
 618                     {
 619                         copy_rvec(x[j],buf[n]);
 620                         n++;
 621                     }
 622                 }
 623             }
 624             else if (!bScrew)
 625             {
 626                 for(i=0; i<ind->nsend[nzone]; i++)
 627                 {
 628                     at0 = cgindex[index[i]];
 629                     at1 = cgindex[index[i]+1];
 630                     for(j=at0; j<at1; j++)
 631                     {
 632                         /* We need to shift the coordinates */
 633                         rvec_add(x[j],shift,buf[n]);
 634                         n++;
 635                     }
 636                 }
 637             }
 638             else
 639             {
 640                 for(i=0; i<ind->nsend[nzone]; i++)
 641                 {
 642                     at0 = cgindex[index[i]];
 643                     at1 = cgindex[index[i]+1];
 644                     for(j=at0; j<at1; j++)
 645                     {
 646                         /* Shift x */
 647                         buf[n][XX] = x[j][XX] + shift[XX];
 648                         /* Rotate y and z.
 649                          * This operation requires a special shift force
 650                          * treatment, which is performed in calc_vir.
 651                          */
 652                         buf[n][YY] = box[YY][YY] - x[j][YY];
 653                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 654                         n++;
 655                     }
 656                 }
 657             }
 658
 659             if (cd->bInPlace)
 660             {
 661                 rbuf = x + nat_tot;
 662             }
 663             else
 664             {
 665                 rbuf = comm->vbuf2.v;
 666             }
 667             /* Send and receive the coordinates */
 668             dd_sendrecv_rvec(dd, d, dddirBackward,
 669                              buf,  ind->nsend[nzone+1],
 670                              rbuf, ind->nrecv[nzone+1]);
 671             if (!cd->bInPlace)
 672             {
 673                 j = 0;
 674                 for(zone=0; zone<nzone; zone++)
 675                 {
 676                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 677                     {
 678                         copy_rvec(rbuf[j],x[i]);
 679                         j++;
 680                     }
 681                 }
 682             }
 683             nat_tot += ind->nrecv[nzone+1];
 684         }
 685         nzone += nzone;
 686     }
 687 }
 688
 689 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 690 {
 691     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 692     int  *index,*cgindex;
 693     gmx_domdec_comm_t *comm;
 694     gmx_domdec_comm_dim_t *cd;
 695     gmx_domdec_ind_t *ind;
 696     rvec *buf,*sbuf;
 697     ivec vis;
 698     int  is;
 699     gmx_bool bPBC,bScrew;
 700
 701     comm = dd->comm;
 702
 703     cgindex = dd->cgindex;
 704
 705     buf = comm->vbuf.v;
 706
 707     n = 0;
 708     nzone = comm->zones.n/2;
 709     nat_tot = dd->nat_tot;
 710     for(d=dd->ndim-1; d>=0; d--)
 711     {
 712         bPBC   = (dd->ci[dd->dim[d]] == 0);
 713         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 714         if (fshift == NULL && !bScrew)
 715         {
 716             bPBC = FALSE;
 717         }
 718         /* Determine which shift vector we need */
 719         clear_ivec(vis);
 720         vis[dd->dim[d]] = 1;
 721         is = IVEC2IS(vis);
 722
 723         cd = &comm->cd[d];
 724         for(p=cd->np-1; p>=0; p--) {
 725             ind = &cd->ind[p];
 726             nat_tot -= ind->nrecv[nzone+1];
 727             if (cd->bInPlace)
 728             {
 729                 sbuf = f + nat_tot;
 730             }
 731             else
 732             {
 733                 sbuf = comm->vbuf2.v;
 734                 j = 0;
 735                 for(zone=0; zone<nzone; zone++)
 736                 {
 737                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 738                     {
 739                         copy_rvec(f[i],sbuf[j]);
 740                         j++;
 741                     }
 742                 }
 743             }
 744             /* Communicate the forces */
 745             dd_sendrecv_rvec(dd, d, dddirForward,
 746                              sbuf, ind->nrecv[nzone+1],
 747                              buf,  ind->nsend[nzone+1]);
 748             index = ind->index;
 749             /* Add the received forces */
 750             n = 0;
 751             if (!bPBC)
 752             {
 753                 for(i=0; i<ind->nsend[nzone]; i++)
 754                 {
 755                     at0 = cgindex[index[i]];
 756                     at1 = cgindex[index[i]+1];
 757                     for(j=at0; j<at1; j++)
 758                     {
 759                         rvec_inc(f[j],buf[n]);
 760                         n++;
 761                     }
 762                 }
 763             }
 764             else if (!bScrew)
 765             {
 766                 for(i=0; i<ind->nsend[nzone]; i++)
 767                 {
 768                     at0 = cgindex[index[i]];
 769                     at1 = cgindex[index[i]+1];
 770                     for(j=at0; j<at1; j++)
 771                     {
 772                         rvec_inc(f[j],buf[n]);
 773                         /* Add this force to the shift force */
 774                         rvec_inc(fshift[is],buf[n]);
 775                         n++;
 776                     }
 777                 }
 778             }
 779             else
 780             {
 781                 for(i=0; i<ind->nsend[nzone]; i++)
 782                 {
 783                     at0 = cgindex[index[i]];
 784                     at1 = cgindex[index[i]+1];
 785                     for(j=at0; j<at1; j++)
 786                     {
 787                         /* Rotate the force */
 788                         f[j][XX] += buf[n][XX];
 789                         f[j][YY] -= buf[n][YY];
 790                         f[j][ZZ] -= buf[n][ZZ];
 791                         if (fshift)
 792                         {
 793                             /* Add this force to the shift force */
 794                             rvec_inc(fshift[is],buf[n]);
 795                         }
 796                         n++;
 797                     }
 798                 }
 799             }
 800         }
 801         nzone /= 2;
 802     }
 803 }
 804
 805 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 806 {
 807     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 808     int  *index,*cgindex;
 809     gmx_domdec_comm_t *comm;
 810     gmx_domdec_comm_dim_t *cd;
 811     gmx_domdec_ind_t *ind;
 812     real *buf,*rbuf;
 813
 814     comm = dd->comm;
 815
 816     cgindex = dd->cgindex;
 817
 818     buf = &comm->vbuf.v[0][0];
 819
 820     nzone = 1;
 821     nat_tot = dd->nat_home;
 822     for(d=0; d<dd->ndim; d++)
 823     {
 824         cd = &comm->cd[d];
 825         for(p=0; p<cd->np; p++)
 826         {
 827             ind = &cd->ind[p];
 828             index = ind->index;
 829             n = 0;
 830             for(i=0; i<ind->nsend[nzone]; i++)
 831             {
 832                 at0 = cgindex[index[i]];
 833                 at1 = cgindex[index[i]+1];
 834                 for(j=at0; j<at1; j++)
 835                 {
 836                     buf[n] = v[j];
 837                     n++;
 838                 }
 839             }
 840
 841             if (cd->bInPlace)
 842             {
 843                 rbuf = v + nat_tot;
 844             }
 845             else
 846             {
 847                 rbuf = &comm->vbuf2.v[0][0];
 848             }
 849             /* Send and receive the coordinates */
 850             dd_sendrecv_real(dd, d, dddirBackward,
 851                              buf,  ind->nsend[nzone+1],
 852                              rbuf, ind->nrecv[nzone+1]);
 853             if (!cd->bInPlace)
 854             {
 855                 j = 0;
 856                 for(zone=0; zone<nzone; zone++)
 857                 {
 858                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 859                     {
 860                         v[i] = rbuf[j];
 861                         j++;
 862                     }
 863                 }
 864             }
 865             nat_tot += ind->nrecv[nzone+1];
 866         }
 867         nzone += nzone;
 868     }
 869 }
 870
 871 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 872 {
 873     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 874     int  *index,*cgindex;
 875     gmx_domdec_comm_t *comm;
 876     gmx_domdec_comm_dim_t *cd;
 877     gmx_domdec_ind_t *ind;
 878     real *buf,*sbuf;
 879
 880     comm = dd->comm;
 881
 882     cgindex = dd->cgindex;
 883
 884     buf = &comm->vbuf.v[0][0];
 885
 886     n = 0;
 887     nzone = comm->zones.n/2;
 888     nat_tot = dd->nat_tot;
 889     for(d=dd->ndim-1; d>=0; d--)
 890     {
 891         cd = &comm->cd[d];
 892         for(p=cd->np-1; p>=0; p--) {
 893             ind = &cd->ind[p];
 894             nat_tot -= ind->nrecv[nzone+1];
 895             if (cd->bInPlace)
 896             {
 897                 sbuf = v + nat_tot;
 898             }
 899             else
 900             {
 901                 sbuf = &comm->vbuf2.v[0][0];
 902                 j = 0;
 903                 for(zone=0; zone<nzone; zone++)
 904                 {
 905                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 906                     {
 907                         sbuf[j] = v[i];
 908                         j++;
 909                     }
 910                 }
 911             }
 912             /* Communicate the forces */
 913             dd_sendrecv_real(dd, d, dddirForward,
 914                              sbuf, ind->nrecv[nzone+1],
 915                              buf,  ind->nsend[nzone+1]);
 916             index = ind->index;
 917             /* Add the received forces */
 918             n = 0;
 919             for(i=0; i<ind->nsend[nzone]; i++)
 920             {
 921                 at0 = cgindex[index[i]];
 922                 at1 = cgindex[index[i]+1];
 923                 for(j=at0; j<at1; j++)
 924                 {
 925                     v[j] += buf[n];
 926                     n++;
 927                 }
 928             }
 929         }
 930         nzone /= 2;
 931     }
 932 }
 933
 934 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 935 {
 936     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 937             d,i,j,
 938             zone->min0,zone->max1,
 939             zone->mch0,zone->mch0,
 940             zone->p1_0,zone->p1_1);
 941 }
 942
 943 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 944                                int ddimind,int direction,
 945                                gmx_ddzone_t *buf_s,int n_s,
 946                                gmx_ddzone_t *buf_r,int n_r)
 947 {
 948     rvec vbuf_s[5*2],vbuf_r[5*2];
 949     int i;
 950
 951     for(i=0; i<n_s; i++)
 952     {
 953         vbuf_s[i*2  ][0] = buf_s[i].min0;
 954         vbuf_s[i*2  ][1] = buf_s[i].max1;
 955         vbuf_s[i*2  ][2] = buf_s[i].mch0;
 956         vbuf_s[i*2+1][0] = buf_s[i].mch1;
 957         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
 958         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
 959     }
 960
 961     dd_sendrecv_rvec(dd, ddimind, direction,
 962                      vbuf_s, n_s*2,
 963                      vbuf_r, n_r*2);
 964
 965     for(i=0; i<n_r; i++)
 966     {
 967         buf_r[i].min0 = vbuf_r[i*2  ][0];
 968         buf_r[i].max1 = vbuf_r[i*2  ][1];
 969         buf_r[i].mch0 = vbuf_r[i*2  ][2];
 970         buf_r[i].mch1 = vbuf_r[i*2+1][0];
 971         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
 972         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
 973     }
 974 }
 975
 976 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 977                           rvec cell_ns_x0,rvec cell_ns_x1)
 978 {
 979     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 980     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
 981     rvec extr_s[2],extr_r[2];
 982     rvec dh;
 983     real dist_d,c=0,det;
 984     gmx_domdec_comm_t *comm;
 985     gmx_bool bPBC,bUse;
 986
 987     comm = dd->comm;
 988
 989     for(d=1; d<dd->ndim; d++)
 990     {
 991         dim = dd->dim[d];
 992         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 993         zp->min0 = cell_ns_x0[dim];
 994         zp->max1 = cell_ns_x1[dim];
 995         zp->mch0 = cell_ns_x0[dim];
 996         zp->mch1 = cell_ns_x1[dim];
 997         zp->p1_0 = cell_ns_x0[dim];
 998         zp->p1_1 = cell_ns_x1[dim];
 999     }
1000
1001     for(d=dd->ndim-2; d>=0; d--)
1002     {
1003         dim  = dd->dim[d];
1004         bPBC = (dim < ddbox->npbcdim);
1005
1006         /* Use an rvec to store two reals */
1007         extr_s[d][0] = comm->cell_f0[d+1];
1008         extr_s[d][1] = comm->cell_f1[d+1];
1009         extr_s[d][2] = 0;
1010
1011         pos = 0;
1012         /* Store the extremes in the backward sending buffer,
1013          * so the get updated separately from the forward communication.
1014          */
1015         for(d1=d; d1<dd->ndim-1; d1++)
1016         {
1017             /* We invert the order to be able to use the same loop for buf_e */
1018             buf_s[pos].min0 = extr_s[d1][1];
1019             buf_s[pos].max1 = extr_s[d1][0];
1020             buf_s[pos].mch0 = 0;
1021             buf_s[pos].mch1 = 0;
1022             /* Store the cell corner of the dimension we communicate along */
1023             buf_s[pos].p1_0 = comm->cell_x0[dim];
1024             buf_s[pos].p1_1 = 0;
1025             pos++;
1026         }
1027
1028         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1029         pos++;
1030
1031         if (dd->ndim == 3 && d == 0)
1032         {
1033             buf_s[pos] = comm->zone_d2[0][1];
1034             pos++;
1035             buf_s[pos] = comm->zone_d1[0];
1036             pos++;
1037         }
1038
1039         /* We only need to communicate the extremes
1040          * in the forward direction
1041          */
1042         npulse = comm->cd[d].np;
1043         if (bPBC)
1044         {
1045             /* Take the minimum to avoid double communication */
1046             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1047         }
1048         else
1049         {
1050             /* Without PBC we should really not communicate over
1051              * the boundaries, but implementing that complicates
1052              * the communication setup and therefore we simply
1053              * do all communication, but ignore some data.
1054              */
1055             npulse_min = npulse;
1056         }
1057         for(p=0; p<npulse_min; p++)
1058         {
1059             /* Communicate the extremes forward */
1060             bUse = (bPBC || dd->ci[dim] > 0);
1061
1062             dd_sendrecv_rvec(dd, d, dddirForward,
1063                              extr_s+d, dd->ndim-d-1,
1064                              extr_r+d, dd->ndim-d-1);
1065
1066             if (bUse)
1067             {
1068                 for(d1=d; d1<dd->ndim-1; d1++)
1069                 {
1070                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1071                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1072                 }
1073             }
1074         }
1075
1076         buf_size = pos;
1077         for(p=0; p<npulse; p++)
1078         {
1079             /* Communicate all the zone information backward */
1080             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1081
1082             dd_sendrecv_ddzone(dd, d, dddirBackward,
1083                                buf_s, buf_size,
1084                                buf_r, buf_size);
1085
1086             clear_rvec(dh);
1087             if (p > 0)
1088             {
1089                 for(d1=d+1; d1<dd->ndim; d1++)
1090                 {
1091                     /* Determine the decrease of maximum required
1092                      * communication height along d1 due to the distance along d,
1093                      * this avoids a lot of useless atom communication.
1094                      */
1095                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1096
1097                     if (ddbox->tric_dir[dim])
1098                     {
1099                         /* c is the off-diagonal coupling between the cell planes
1100                          * along directions d and d1.
1101                          */
1102                         c = ddbox->v[dim][dd->dim[d1]][dim];
1103                     }
1104                     else
1105                     {
1106                         c = 0;
1107                     }
1108                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1109                     if (det > 0)
1110                     {
1111                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1112                     }
1113                     else
1114                     {
1115                         /* A negative value signals out of range */
1116                         dh[d1] = -1;
1117                     }
1118                 }
1119             }
1120
1121             /* Accumulate the extremes over all pulses */
1122             for(i=0; i<buf_size; i++)
1123             {
1124                 if (p == 0)
1125                 {
1126                     buf_e[i] = buf_r[i];
1127                 }
1128                 else
1129                 {
1130                     if (bUse)
1131                     {
1132                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1133                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1134                     }
1135
1136                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1137                     {
1138                         d1 = 1;
1139                     }
1140                     else
1141                     {
1142                         d1 = d + 1;
1143                     }
1144                     if (bUse && dh[d1] >= 0)
1145                     {
1146                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1147                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1148                     }
1149                 }
1150                 /* Copy the received buffer to the send buffer,
1151                  * to pass the data through with the next pulse.
1152                  */
1153                 buf_s[i] = buf_r[i];
1154             }
1155             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1156                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1157             {
1158                 /* Store the extremes */
1159                 pos = 0;
1160
1161                 for(d1=d; d1<dd->ndim-1; d1++)
1162                 {
1163                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1164                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1165                     pos++;
1166                 }
1167
1168                 if (d == 1 || (d == 0 && dd->ndim == 3))
1169                 {
1170                     for(i=d; i<2; i++)
1171                     {
1172                         comm->zone_d2[1-d][i] = buf_e[pos];
1173                         pos++;
1174                     }
1175                 }
1176                 if (d == 0)
1177                 {
1178                     comm->zone_d1[1] = buf_e[pos];
1179                     pos++;
1180                 }
1181             }
1182         }
1183     }
1184
1185     if (dd->ndim >= 2)
1186     {
1187         dim = dd->dim[1];
1188         for(i=0; i<2; i++)
1189         {
1190             if (debug)
1191             {
1192                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1193             }
1194             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1195             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1196         }
1197     }
1198     if (dd->ndim >= 3)
1199     {
1200         dim = dd->dim[2];
1201         for(i=0; i<2; i++)
1202         {
1203             for(j=0; j<2; j++)
1204             {
1205                 if (debug)
1206                 {
1207                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1208                 }
1209                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1210                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1211             }
1212         }
1213     }
1214     for(d=1; d<dd->ndim; d++)
1215     {
1216         comm->cell_f_max0[d] = extr_s[d-1][0];
1217         comm->cell_f_min1[d] = extr_s[d-1][1];
1218         if (debug)
1219         {
1220             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1221                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1222         }
1223     }
1224 }
1225
1226 static void dd_collect_cg(gmx_domdec_t *dd,
1227                           t_state *state_local)
1228 {
1229     gmx_domdec_master_t *ma=NULL;
1230     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1231     t_block *cgs_gl;
1232
1233     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1234     {
1235         /* The master has the correct distribution */
1236         return;
1237     }
1238
1239     if (state_local->ddp_count == dd->ddp_count)
1240     {
1241         ncg_home = dd->ncg_home;
1242         cg       = dd->index_gl;
1243         nat_home = dd->nat_home;
1244     }
1245     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1246     {
1247         cgs_gl = &dd->comm->cgs_gl;
1248
1249         ncg_home = state_local->ncg_gl;
1250         cg       = state_local->cg_gl;
1251         nat_home = 0;
1252         for(i=0; i<ncg_home; i++)
1253         {
1254             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1255         }
1256     }
1257     else
1258     {
1259         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1260     }
1261
1262     buf2[0] = dd->ncg_home;
1263     buf2[1] = dd->nat_home;
1264     if (DDMASTER(dd))
1265     {
1266         ma = dd->ma;
1267         ibuf = ma->ibuf;
1268     }
1269     else
1270     {
1271         ibuf = NULL;
1272     }
1273     /* Collect the charge group and atom counts on the master */
1274     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1275
1276     if (DDMASTER(dd))
1277     {
1278         ma->index[0] = 0;
1279         for(i=0; i<dd->nnodes; i++)
1280         {
1281             ma->ncg[i] = ma->ibuf[2*i];
1282             ma->nat[i] = ma->ibuf[2*i+1];
1283             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1284
1285         }
1286         /* Make byte counts and indices */
1287         for(i=0; i<dd->nnodes; i++)
1288         {
1289             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1290             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1291         }
1292         if (debug)
1293         {
1294             fprintf(debug,"Initial charge group distribution: ");
1295             for(i=0; i<dd->nnodes; i++)
1296                 fprintf(debug," %d",ma->ncg[i]);
1297             fprintf(debug,"\n");
1298         }
1299     }
1300
1301     /* Collect the charge group indices on the master */
1302     dd_gatherv(dd,
1303                dd->ncg_home*sizeof(int),dd->index_gl,
1304                DDMASTER(dd) ? ma->ibuf : NULL,
1305                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1306                DDMASTER(dd) ? ma->cg : NULL);
1307
1308     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1309 }
1310
1311 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1312                                     rvec *lv,rvec *v)
1313 {
1314     gmx_domdec_master_t *ma;
1315     int  n,i,c,a,nalloc=0;
1316     rvec *buf=NULL;
1317     t_block *cgs_gl;
1318
1319     ma = dd->ma;
1320
1321     if (!DDMASTER(dd))
1322     {
1323 #ifdef GMX_MPI
1324         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1325                  dd->rank,dd->mpi_comm_all);
1326 #endif
1327     } else {
1328         /* Copy the master coordinates to the global array */
1329         cgs_gl = &dd->comm->cgs_gl;
1330
1331         n = DDMASTERRANK(dd);
1332         a = 0;
1333         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1334         {
1335             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1336             {
1337                 copy_rvec(lv[a++],v[c]);
1338             }
1339         }
1340
1341         for(n=0; n<dd->nnodes; n++)
1342         {
1343             if (n != dd->rank)
1344             {
1345                 if (ma->nat[n] > nalloc)
1346                 {
1347                     nalloc = over_alloc_dd(ma->nat[n]);
1348                     srenew(buf,nalloc);
1349                 }
1350 #ifdef GMX_MPI
1351                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1352                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1353 #endif
1354                 a = 0;
1355                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1356                 {
1357                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1358                     {
1359                         copy_rvec(buf[a++],v[c]);
1360                     }
1361                 }
1362             }
1363         }
1364         sfree(buf);
1365     }
1366 }
1367
1368 static void get_commbuffer_counts(gmx_domdec_t *dd,
1369                                   int **counts,int **disps)
1370 {
1371     gmx_domdec_master_t *ma;
1372     int n;
1373
1374     ma = dd->ma;
1375
1376     /* Make the rvec count and displacment arrays */
1377     *counts  = ma->ibuf;
1378     *disps   = ma->ibuf + dd->nnodes;
1379     for(n=0; n<dd->nnodes; n++)
1380     {
1381         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1382         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1383     }
1384 }
1385
1386 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1387                                    rvec *lv,rvec *v)
1388 {
1389     gmx_domdec_master_t *ma;
1390     int  *rcounts=NULL,*disps=NULL;
1391     int  n,i,c,a;
1392     rvec *buf=NULL;
1393     t_block *cgs_gl;
1394
1395     ma = dd->ma;
1396
1397     if (DDMASTER(dd))
1398     {
1399         get_commbuffer_counts(dd,&rcounts,&disps);
1400
1401         buf = ma->vbuf;
1402     }
1403
1404     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1405
1406     if (DDMASTER(dd))
1407     {
1408         cgs_gl = &dd->comm->cgs_gl;
1409
1410         a = 0;
1411         for(n=0; n<dd->nnodes; n++)
1412         {
1413             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1414             {
1415                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1416                 {
1417                     copy_rvec(buf[a++],v[c]);
1418                 }
1419             }
1420         }
1421     }
1422 }
1423
1424 void dd_collect_vec(gmx_domdec_t *dd,
1425                     t_state *state_local,rvec *lv,rvec *v)
1426 {
1427     gmx_domdec_master_t *ma;
1428     int  n,i,c,a,nalloc=0;
1429     rvec *buf=NULL;
1430
1431     dd_collect_cg(dd,state_local);
1432
1433     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1434     {
1435         dd_collect_vec_sendrecv(dd,lv,v);
1436     }
1437     else
1438     {
1439         dd_collect_vec_gatherv(dd,lv,v);
1440     }
1441 }
1442
1443
1444 void dd_collect_state(gmx_domdec_t *dd,
1445                       t_state *state_local,t_state *state)
1446 {
1447     int est,i,j,nh;
1448
1449     nh = state->nhchainlength;
1450
1451     if (DDMASTER(dd))
1452     {
1453         state->lambda = state_local->lambda;
1454         state->veta = state_local->veta;
1455         state->vol0 = state_local->vol0;
1456         copy_mat(state_local->box,state->box);
1457         copy_mat(state_local->boxv,state->boxv);
1458         copy_mat(state_local->svir_prev,state->svir_prev);
1459         copy_mat(state_local->fvir_prev,state->fvir_prev);
1460         copy_mat(state_local->pres_prev,state->pres_prev);
1461
1462
1463         for(i=0; i<state_local->ngtc; i++)
1464         {
1465             for(j=0; j<nh; j++) {
1466                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1467                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1468             }
1469             state->therm_integral[i] = state_local->therm_integral[i];
1470         }
1471         for(i=0; i<state_local->nnhpres; i++)
1472         {
1473             for(j=0; j<nh; j++) {
1474                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1475                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1476             }
1477         }
1478     }
1479     for(est=0; est<estNR; est++)
1480     {
1481         if (EST_DISTR(est) && state_local->flags & (1<<est))
1482         {
1483             switch (est) {
1484             case estX:
1485                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1486                 break;
1487             case estV:
1488                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1489                 break;
1490             case estSDX:
1491                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1492                 break;
1493             case estCGP:
1494                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1495                 break;
1496             case estLD_RNG:
1497                 if (state->nrngi == 1)
1498                 {
1499                     if (DDMASTER(dd))
1500                     {
1501                         for(i=0; i<state_local->nrng; i++)
1502                         {
1503                             state->ld_rng[i] = state_local->ld_rng[i];
1504                         }
1505                     }
1506                 }
1507                 else
1508                 {
1509                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1510                               state_local->ld_rng,state->ld_rng);
1511                 }
1512                 break;
1513             case estLD_RNGI:
1514                 if (state->nrngi == 1)
1515                 {
1516                    if (DDMASTER(dd))
1517                     {
1518                         state->ld_rngi[0] = state_local->ld_rngi[0];
1519                     }
1520                 }
1521                 else
1522                 {
1523                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1524                               state_local->ld_rngi,state->ld_rngi);
1525                 }
1526                 break;
1527             case estDISRE_INITF:
1528             case estDISRE_RM3TAV:
1529             case estORIRE_INITF:
1530             case estORIRE_DTAV:
1531                 break;
1532             default:
1533                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1534             }
1535         }
1536     }
1537 }
1538
1539 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1540 {
1541     if (debug)
1542     {
1543         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1544     }
1545     fr->cg_nalloc = over_alloc_dd(nalloc);
1546     srenew(fr->cg_cm,fr->cg_nalloc);
1547     srenew(fr->cginfo,fr->cg_nalloc);
1548 }
1549
1550 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1551 {
1552     int est;
1553
1554     if (debug)
1555     {
1556         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1557     }
1558
1559     state->nalloc = over_alloc_dd(nalloc);
1560
1561     for(est=0; est<estNR; est++)
1562     {
1563         if (EST_DISTR(est) && state->flags & (1<<est))
1564         {
1565             switch(est) {
1566             case estX:
1567                 srenew(state->x,state->nalloc);
1568                 break;
1569             case estV:
1570                 srenew(state->v,state->nalloc);
1571                 break;
1572             case estSDX:
1573                 srenew(state->sd_X,state->nalloc);
1574                 break;
1575             case estCGP:
1576                 srenew(state->cg_p,state->nalloc);
1577                 break;
1578             case estLD_RNG:
1579             case estLD_RNGI:
1580             case estDISRE_INITF:
1581             case estDISRE_RM3TAV:
1582             case estORIRE_INITF:
1583             case estORIRE_DTAV:
1584                 /* No reallocation required */
1585                 break;
1586             default:
1587                 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1588             }
1589         }
1590     }
1591
1592     if (f != NULL)
1593     {
1594         srenew(*f,state->nalloc);
1595     }
1596 }
1597
1598 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1599                                        rvec *v,rvec *lv)
1600 {
1601     gmx_domdec_master_t *ma;
1602     int  n,i,c,a,nalloc=0;
1603     rvec *buf=NULL;
1604
1605     if (DDMASTER(dd))
1606     {
1607         ma  = dd->ma;
1608
1609         for(n=0; n<dd->nnodes; n++)
1610         {
1611             if (n != dd->rank)
1612             {
1613                 if (ma->nat[n] > nalloc)
1614                 {
1615                     nalloc = over_alloc_dd(ma->nat[n]);
1616                     srenew(buf,nalloc);
1617                 }
1618                 /* Use lv as a temporary buffer */
1619                 a = 0;
1620                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1621                 {
1622                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1623                     {
1624                         copy_rvec(v[c],buf[a++]);
1625                     }
1626                 }
1627                 if (a != ma->nat[n])
1628                 {
1629                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1630                               a,ma->nat[n]);
1631                 }
1632
1633 #ifdef GMX_MPI
1634                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1635                          DDRANK(dd,n),n,dd->mpi_comm_all);
1636 #endif
1637             }
1638         }
1639         sfree(buf);
1640         n = DDMASTERRANK(dd);
1641         a = 0;
1642         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1643         {
1644             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1645             {
1646                 copy_rvec(v[c],lv[a++]);
1647             }
1648         }
1649     }
1650     else
1651     {
1652 #ifdef GMX_MPI
1653         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1654                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1655 #endif
1656     }
1657 }
1658
1659 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1660                                        rvec *v,rvec *lv)
1661 {
1662     gmx_domdec_master_t *ma;
1663     int  *scounts=NULL,*disps=NULL;
1664     int  n,i,c,a,nalloc=0;
1665     rvec *buf=NULL;
1666
1667     if (DDMASTER(dd))
1668     {
1669         ma  = dd->ma;
1670
1671         get_commbuffer_counts(dd,&scounts,&disps);
1672
1673         buf = ma->vbuf;
1674         a = 0;
1675         for(n=0; n<dd->nnodes; n++)
1676         {
1677             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1678             {
1679                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1680                 {
1681                     copy_rvec(v[c],buf[a++]);
1682                 }
1683             }
1684         }
1685     }
1686
1687     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1688 }
1689
1690 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1691 {
1692     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1693     {
1694         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1695     }
1696     else
1697     {
1698         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1699     }
1700 }
1701
1702 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1703                                 t_state *state,t_state *state_local,
1704                                 rvec **f)
1705 {
1706     int  i,j,ngtch,ngtcp,nh;
1707
1708     nh = state->nhchainlength;
1709
1710     if (DDMASTER(dd))
1711     {
1712         state_local->lambda = state->lambda;
1713         state_local->veta   = state->veta;
1714         state_local->vol0   = state->vol0;
1715         copy_mat(state->box,state_local->box);
1716         copy_mat(state->box_rel,state_local->box_rel);
1717         copy_mat(state->boxv,state_local->boxv);
1718         copy_mat(state->svir_prev,state_local->svir_prev);
1719         copy_mat(state->fvir_prev,state_local->fvir_prev);
1720         for(i=0; i<state_local->ngtc; i++)
1721         {
1722             for(j=0; j<nh; j++) {
1723                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1724                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1725             }
1726             state_local->therm_integral[i] = state->therm_integral[i];
1727         }
1728         for(i=0; i<state_local->nnhpres; i++)
1729         {
1730             for(j=0; j<nh; j++) {
1731                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1732                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1733             }
1734         }
1735     }
1736     dd_bcast(dd,sizeof(real),&state_local->lambda);
1737     dd_bcast(dd,sizeof(real),&state_local->veta);
1738     dd_bcast(dd,sizeof(real),&state_local->vol0);
1739     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1740     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1741     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1742     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1743     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1744     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1745     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1746     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1747     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1748     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1749
1750     if (dd->nat_home > state_local->nalloc)
1751     {
1752         dd_realloc_state(state_local,f,dd->nat_home);
1753     }
1754     for(i=0; i<estNR; i++)
1755     {
1756         if (EST_DISTR(i) && state_local->flags & (1<<i))
1757         {
1758             switch (i) {
1759             case estX:
1760                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1761                 break;
1762             case estV:
1763                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1764                 break;
1765             case estSDX:
1766                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1767                 break;
1768             case estCGP:
1769                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1770                 break;
1771             case estLD_RNG:
1772                 if (state->nrngi == 1)
1773                 {
1774                     dd_bcastc(dd,
1775                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1776                               state->ld_rng,state_local->ld_rng);
1777                 }
1778                 else
1779                 {
1780                     dd_scatter(dd,
1781                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1782                                state->ld_rng,state_local->ld_rng);
1783                 }
1784                 break;
1785             case estLD_RNGI:
1786                 if (state->nrngi == 1)
1787                 {
1788                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1789                               state->ld_rngi,state_local->ld_rngi);
1790                 }
1791                 else
1792                 {
1793                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1794                                state->ld_rngi,state_local->ld_rngi);
1795                 }
1796                 break;
1797             case estDISRE_INITF:
1798             case estDISRE_RM3TAV:
1799             case estORIRE_INITF:
1800             case estORIRE_DTAV:
1801                 /* Not implemented yet */
1802                 break;
1803             default:
1804                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1805             }
1806         }
1807     }
1808 }
1809
1810 static char dim2char(int dim)
1811 {
1812     char c='?';
1813
1814     switch (dim)
1815     {
1816     case XX: c = 'X'; break;
1817     case YY: c = 'Y'; break;
1818     case ZZ: c = 'Z'; break;
1819     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1820     }
1821
1822     return c;
1823 }
1824
1825 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1826                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1827 {
1828     rvec grid_s[2],*grid_r=NULL,cx,r;
1829     char fname[STRLEN],format[STRLEN],buf[22];
1830     FILE *out;
1831     int  a,i,d,z,y,x;
1832     matrix tric;
1833     real vol;
1834
1835     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1836     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1837
1838     if (DDMASTER(dd))
1839     {
1840         snew(grid_r,2*dd->nnodes);
1841     }
1842
1843     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1844
1845     if (DDMASTER(dd))
1846     {
1847         for(d=0; d<DIM; d++)
1848         {
1849             for(i=0; i<DIM; i++)
1850             {
1851                 if (d == i)
1852                 {
1853                     tric[d][i] = 1;
1854                 }
1855                 else
1856                 {
1857                     if (dd->nc[d] > 1 && d < ddbox->npbcdim)
1858                     {
1859                         tric[d][i] = box[i][d]/box[i][i];
1860                     }
1861                     else
1862                     {
1863                         tric[d][i] = 0;
1864                     }
1865                 }
1866             }
1867         }
1868         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1869         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1870         out = gmx_fio_fopen(fname,"w");
1871         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1872         a = 1;
1873         for(i=0; i<dd->nnodes; i++)
1874         {
1875             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1876             for(d=0; d<DIM; d++)
1877             {
1878                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1879             }
1880             for(z=0; z<2; z++)
1881             {
1882                 for(y=0; y<2; y++)
1883                 {
1884                     for(x=0; x<2; x++)
1885                     {
1886                         cx[XX] = grid_r[i*2+x][XX];
1887                         cx[YY] = grid_r[i*2+y][YY];
1888                         cx[ZZ] = grid_r[i*2+z][ZZ];
1889                         mvmul(tric,cx,r);
1890                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1891                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1892                     }
1893                 }
1894             }
1895             for(d=0; d<DIM; d++)
1896             {
1897                 for(x=0; x<4; x++)
1898                 {
1899                     switch(d)
1900                     {
1901                     case 0: y = 1 + i*8 + 2*x; break;
1902                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1903                     case 2: y = 1 + i*8 + x; break;
1904                     }
1905                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1906                 }
1907             }
1908         }
1909         gmx_fio_fclose(out);
1910         sfree(grid_r);
1911     }
1912 }
1913
1914 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1915                   gmx_mtop_t *mtop,t_commrec *cr,
1916                   int natoms,rvec x[],matrix box)
1917 {
1918     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1919     FILE *out;
1920     int  i,ii,resnr,c;
1921     char *atomname,*resname;
1922     real b;
1923     gmx_domdec_t *dd;
1924
1925     dd = cr->dd;
1926     if (natoms == -1)
1927     {
1928         natoms = dd->comm->nat[ddnatVSITE];
1929     }
1930
1931     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1932
1933     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1934     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1935
1936     out = gmx_fio_fopen(fname,"w");
1937
1938     fprintf(out,"TITLE     %s\n",title);
1939     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1940     for(i=0; i<natoms; i++)
1941     {
1942         ii = dd->gatindex[i];
1943         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1944         if (i < dd->comm->nat[ddnatZONE])
1945         {
1946             c = 0;
1947             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1948             {
1949                 c++;
1950             }
1951             b = c;
1952         }
1953         else if (i < dd->comm->nat[ddnatVSITE])
1954         {
1955             b = dd->comm->zones.n;
1956         }
1957         else
1958         {
1959             b = dd->comm->zones.n + 1;
1960         }
1961         fprintf(out,strlen(atomname)<4 ? format : format4,
1962                 "ATOM",(ii+1)%100000,
1963                 atomname,resname,' ',resnr%10000,' ',
1964                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1965     }
1966     fprintf(out,"TER\n");
1967
1968     gmx_fio_fclose(out);
1969 }
1970
1971 real dd_cutoff_mbody(gmx_domdec_t *dd)
1972 {
1973     gmx_domdec_comm_t *comm;
1974     int  di;
1975     real r;
1976
1977     comm = dd->comm;
1978
1979     r = -1;
1980     if (comm->bInterCGBondeds)
1981     {
1982         if (comm->cutoff_mbody > 0)
1983         {
1984             r = comm->cutoff_mbody;
1985         }
1986         else
1987         {
1988             /* cutoff_mbody=0 means we do not have DLB */
1989             r = comm->cellsize_min[dd->dim[0]];
1990             for(di=1; di<dd->ndim; di++)
1991             {
1992                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1993             }
1994             if (comm->bBondComm)
1995             {
1996                 r = max(r,comm->cutoff_mbody);
1997             }
1998             else
1999             {
2000                 r = min(r,comm->cutoff);
2001             }
2002         }
2003     }
2004
2005     return r;
2006 }
2007
2008 real dd_cutoff_twobody(gmx_domdec_t *dd)
2009 {
2010     real r_mb;
2011
2012     r_mb = dd_cutoff_mbody(dd);
2013
2014     return max(dd->comm->cutoff,r_mb);
2015 }
2016
2017
2018 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2019 {
2020     int nc,ntot;
2021
2022     nc   = dd->nc[dd->comm->cartpmedim];
2023     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2024     copy_ivec(coord,coord_pme);
2025     coord_pme[dd->comm->cartpmedim] =
2026         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2027 }
2028
2029 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2030 {
2031     /* Here we assign a PME node to communicate with this DD node
2032      * by assuming that the major index of both is x.
2033      * We add cr->npmenodes/2 to obtain an even distribution.
2034      */
2035     return (ddindex*npme + npme/2)/ndd;
2036 }
2037
2038 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2039 {
2040     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2041 }
2042
2043 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2044 {
2045     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2046 }
2047
2048 static int *dd_pmenodes(t_commrec *cr)
2049 {
2050     int *pmenodes;
2051     int n,i,p0,p1;
2052
2053     snew(pmenodes,cr->npmenodes);
2054     n = 0;
2055     for(i=0; i<cr->dd->nnodes; i++) {
2056         p0 = cr_ddindex2pmeindex(cr,i);
2057         p1 = cr_ddindex2pmeindex(cr,i+1);
2058         if (i+1 == cr->dd->nnodes || p1 > p0) {
2059             if (debug)
2060                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2061             pmenodes[n] = i + 1 + n;
2062             n++;
2063         }
2064     }
2065
2066     return pmenodes;
2067 }
2068
2069 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2070 {
2071     gmx_domdec_t *dd;
2072     ivec coords,coords_pme,nc;
2073     int  slab;
2074
2075     dd = cr->dd;
2076     /*
2077       if (dd->comm->bCartesian) {
2078       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2079       dd_coords2pmecoords(dd,coords,coords_pme);
2080       copy_ivec(dd->ntot,nc);
2081       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2082       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2083
2084       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2085       } else {
2086       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2087       }
2088     */
2089     coords[XX] = x;
2090     coords[YY] = y;
2091     coords[ZZ] = z;
2092     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2093
2094     return slab;
2095 }
2096
2097 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2098 {
2099     gmx_domdec_comm_t *comm;
2100     ivec coords;
2101     int  ddindex,nodeid=-1;
2102
2103     comm = cr->dd->comm;
2104
2105     coords[XX] = x;
2106     coords[YY] = y;
2107     coords[ZZ] = z;
2108     if (comm->bCartesianPP_PME)
2109     {
2110 #ifdef GMX_MPI
2111         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2112 #endif
2113     }
2114     else
2115     {
2116         ddindex = dd_index(cr->dd->nc,coords);
2117         if (comm->bCartesianPP)
2118         {
2119             nodeid = comm->ddindex2simnodeid[ddindex];
2120         }
2121         else
2122         {
2123             if (comm->pmenodes)
2124             {
2125                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2126             }
2127             else
2128             {
2129                 nodeid = ddindex;
2130             }
2131         }
2132     }
2133
2134     return nodeid;
2135 }
2136
2137 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2138 {
2139     gmx_domdec_t *dd;
2140     gmx_domdec_comm_t *comm;
2141     ivec coord,coord_pme;
2142     int  i;
2143     int  pmenode=-1;
2144
2145     dd = cr->dd;
2146     comm = dd->comm;
2147
2148     /* This assumes a uniform x domain decomposition grid cell size */
2149     if (comm->bCartesianPP_PME)
2150     {
2151 #ifdef GMX_MPI
2152         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2153         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2154         {
2155             /* This is a PP node */
2156             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2157             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2158         }
2159 #endif
2160     }
2161     else if (comm->bCartesianPP)
2162     {
2163         if (sim_nodeid < dd->nnodes)
2164         {
2165             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2166         }
2167     }
2168     else
2169     {
2170         /* This assumes DD cells with identical x coordinates
2171          * are numbered sequentially.
2172          */
2173         if (dd->comm->pmenodes == NULL)
2174         {
2175             if (sim_nodeid < dd->nnodes)
2176             {
2177                 /* The DD index equals the nodeid */
2178                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2179             }
2180         }
2181         else
2182         {
2183             i = 0;
2184             while (sim_nodeid > dd->comm->pmenodes[i])
2185             {
2186                 i++;
2187             }
2188             if (sim_nodeid < dd->comm->pmenodes[i])
2189             {
2190                 pmenode = dd->comm->pmenodes[i];
2191             }
2192         }
2193     }
2194
2195     return pmenode;
2196 }
2197
2198 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2199 {
2200     gmx_bool bPMEOnlyNode;
2201
2202     if (DOMAINDECOMP(cr))
2203     {
2204         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2205     }
2206     else
2207     {
2208         bPMEOnlyNode = FALSE;
2209     }
2210
2211     return bPMEOnlyNode;
2212 }
2213
2214 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2215                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2216 {
2217     gmx_domdec_t *dd;
2218     int x,y,z;
2219     ivec coord,coord_pme;
2220
2221     dd = cr->dd;
2222
2223     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2224
2225     *nmy_ddnodes = 0;
2226     for(x=0; x<dd->nc[XX]; x++)
2227     {
2228         for(y=0; y<dd->nc[YY]; y++)
2229         {
2230             for(z=0; z<dd->nc[ZZ]; z++)
2231             {
2232                 if (dd->comm->bCartesianPP_PME)
2233                 {
2234                     coord[XX] = x;
2235                     coord[YY] = y;
2236                     coord[ZZ] = z;
2237                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2238                     if (dd->ci[XX] == coord_pme[XX] &&
2239                         dd->ci[YY] == coord_pme[YY] &&
2240                         dd->ci[ZZ] == coord_pme[ZZ])
2241                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2242                 }
2243                 else
2244                 {
2245                     /* The slab corresponds to the nodeid in the PME group */
2246                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2247                     {
2248                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2249                     }
2250                 }
2251             }
2252         }
2253     }
2254
2255     /* The last PP-only node is the peer node */
2256     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2257
2258     if (debug)
2259     {
2260         fprintf(debug,"Receive coordinates from PP nodes:");
2261         for(x=0; x<*nmy_ddnodes; x++)
2262         {
2263             fprintf(debug," %d",(*my_ddnodes)[x]);
2264         }
2265         fprintf(debug,"\n");
2266     }
2267 }
2268
2269 static gmx_bool receive_vir_ener(t_commrec *cr)
2270 {
2271     gmx_domdec_comm_t *comm;
2272     int  pmenode,coords[DIM],rank;
2273     gmx_bool bReceive;
2274
2275     bReceive = TRUE;
2276     if (cr->npmenodes < cr->dd->nnodes)
2277     {
2278         comm = cr->dd->comm;
2279         if (comm->bCartesianPP_PME)
2280         {
2281             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2282 #ifdef GMX_MPI
2283             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2284             coords[comm->cartpmedim]++;
2285             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2286             {
2287                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2288                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2289                 {
2290                     /* This is not the last PP node for pmenode */
2291                     bReceive = FALSE;
2292                 }
2293             }
2294 #endif
2295         }
2296         else
2297         {
2298             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2299             if (cr->sim_nodeid+1 < cr->nnodes &&
2300                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2301             {
2302                 /* This is not the last PP node for pmenode */
2303                 bReceive = FALSE;
2304             }
2305         }
2306     }
2307
2308     return bReceive;
2309 }
2310
2311 static void set_zones_ncg_home(gmx_domdec_t *dd)
2312 {
2313     gmx_domdec_zones_t *zones;
2314     int i;
2315
2316     zones = &dd->comm->zones;
2317
2318     zones->cg_range[0] = 0;
2319     for(i=1; i<zones->n+1; i++)
2320     {
2321         zones->cg_range[i] = dd->ncg_home;
2322     }
2323 }
2324
2325 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2326 {
2327     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2328
2329     ind = state->cg_gl;
2330     dd_cg_gl = dd->index_gl;
2331     cgindex  = dd->cgindex;
2332     nat = 0;
2333     cgindex[0] = nat;
2334     for(i=0; i<state->ncg_gl; i++)
2335     {
2336         cgindex[i] = nat;
2337         cg_gl = ind[i];
2338         dd_cg_gl[i] = cg_gl;
2339         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2340     }
2341     cgindex[i] = nat;
2342
2343     dd->ncg_home = state->ncg_gl;
2344     dd->nat_home = nat;
2345
2346     set_zones_ncg_home(dd);
2347 }
2348
2349 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2350 {
2351     while (cg >= cginfo_mb->cg_end)
2352     {
2353         cginfo_mb++;
2354     }
2355
2356     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2357 }
2358
2359 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2360                           t_forcerec *fr,char *bLocalCG)
2361 {
2362     cginfo_mb_t *cginfo_mb;
2363     int *cginfo;
2364     int cg;
2365
2366     if (fr != NULL)
2367     {
2368         cginfo_mb = fr->cginfo_mb;
2369         cginfo    = fr->cginfo;
2370
2371         for(cg=cg0; cg<cg1; cg++)
2372         {
2373             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2374         }
2375     }
2376
2377     if (bLocalCG != NULL)
2378     {
2379         for(cg=cg0; cg<cg1; cg++)
2380         {
2381             bLocalCG[index_gl[cg]] = TRUE;
2382         }
2383     }
2384 }
2385
2386 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2387 {
2388     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2389     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2390     gmx_ga2la_t *ga2la;
2391     char *bLocalCG;
2392
2393     bLocalCG = dd->comm->bLocalCG;
2394
2395     if (dd->nat_tot > dd->gatindex_nalloc)
2396     {
2397         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2398         srenew(dd->gatindex,dd->gatindex_nalloc);
2399     }
2400
2401     nzone      = dd->comm->zones.n;
2402     zone2cg    = dd->comm->zones.cg_range;
2403     zone_ncg1  = dd->comm->zone_ncg1;
2404     index_gl   = dd->index_gl;
2405     gatindex   = dd->gatindex;
2406
2407     if (zone2cg[1] != dd->ncg_home)
2408     {
2409         gmx_incons("dd->ncg_zone is not up to date");
2410     }
2411
2412     /* Make the local to global and global to local atom index */
2413     a = dd->cgindex[cg_start];
2414     for(zone=0; zone<nzone; zone++)
2415     {
2416         if (zone == 0)
2417         {
2418             cg0 = cg_start;
2419         }
2420         else
2421         {
2422             cg0 = zone2cg[zone];
2423         }
2424         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2425         {
2426             zone1 = zone;
2427             if (cg - cg0 >= zone_ncg1[zone])
2428             {
2429                 /* Signal that this cg is from more than one zone away */
2430                 zone1 += nzone;
2431             }
2432             cg_gl = index_gl[cg];
2433             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2434             {
2435                 gatindex[a] = a_gl;
2436                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2437                 a++;
2438             }
2439         }
2440     }
2441 }
2442
2443 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2444                           const char *where)
2445 {
2446     int ncg,i,ngl,nerr;
2447
2448     nerr = 0;
2449     if (bLocalCG == NULL)
2450     {
2451         return nerr;
2452     }
2453     for(i=0; i<dd->ncg_tot; i++)
2454     {
2455         if (!bLocalCG[dd->index_gl[i]])
2456         {
2457             fprintf(stderr,
2458                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2459             nerr++;
2460         }
2461     }
2462     ngl = 0;
2463     for(i=0; i<ncg_sys; i++)
2464     {
2465         if (bLocalCG[i])
2466         {
2467             ngl++;
2468         }
2469     }
2470     if (ngl != dd->ncg_tot)
2471     {
2472         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2473         nerr++;
2474     }
2475
2476     return nerr;
2477 }
2478
2479 static void check_index_consistency(gmx_domdec_t *dd,
2480                                     int natoms_sys,int ncg_sys,
2481                                     const char *where)
2482 {
2483     int  nerr,ngl,i,a,cell;
2484     int  *have;
2485
2486     nerr = 0;
2487
2488     if (dd->comm->DD_debug > 1)
2489     {
2490         snew(have,natoms_sys);
2491         for(a=0; a<dd->nat_tot; a++)
2492         {
2493             if (have[dd->gatindex[a]] > 0)
2494             {
2495                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2496             }
2497             else
2498             {
2499                 have[dd->gatindex[a]] = a + 1;
2500             }
2501         }
2502         sfree(have);
2503     }
2504
2505     snew(have,dd->nat_tot);
2506
2507     ngl  = 0;
2508     for(i=0; i<natoms_sys; i++)
2509     {
2510         if (ga2la_get(dd->ga2la,i,&a,&cell))
2511         {
2512             if (a >= dd->nat_tot)
2513             {
2514                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2515                 nerr++;
2516             }
2517             else
2518             {
2519                 have[a] = 1;
2520                 if (dd->gatindex[a] != i)
2521                 {
2522                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2523                     nerr++;
2524                 }
2525             }
2526             ngl++;
2527         }
2528     }
2529     if (ngl != dd->nat_tot)
2530     {
2531         fprintf(stderr,
2532                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2533                 dd->rank,where,ngl,dd->nat_tot);
2534     }
2535     for(a=0; a<dd->nat_tot; a++)
2536     {
2537         if (have[a] == 0)
2538         {
2539             fprintf(stderr,
2540                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2541                     dd->rank,where,a+1,dd->gatindex[a]+1);
2542         }
2543     }
2544     sfree(have);
2545
2546     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2547
2548     if (nerr > 0) {
2549         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2550                   dd->rank,where,nerr);
2551     }
2552 }
2553
2554 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2555 {
2556     int  i;
2557     char *bLocalCG;
2558
2559     if (a_start == 0)
2560     {
2561         /* Clear the whole list without searching */
2562         ga2la_clear(dd->ga2la);
2563     }
2564     else
2565     {
2566         for(i=a_start; i<dd->nat_tot; i++)
2567         {
2568             ga2la_del(dd->ga2la,dd->gatindex[i]);
2569         }
2570     }
2571
2572     bLocalCG = dd->comm->bLocalCG;
2573     if (bLocalCG)
2574     {
2575         for(i=cg_start; i<dd->ncg_tot; i++)
2576         {
2577             bLocalCG[dd->index_gl[i]] = FALSE;
2578         }
2579     }
2580
2581     dd_clear_local_vsite_indices(dd);
2582
2583     if (dd->constraints)
2584     {
2585         dd_clear_local_constraint_indices(dd);
2586     }
2587 }
2588
2589 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2590 {
2591     real grid_jump_limit;
2592
2593     /* The distance between the boundaries of cells at distance
2594      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2595      * and by the fact that cells should not be shifted by more than
2596      * half their size, such that cg's only shift by one cell
2597      * at redecomposition.
2598      */
2599     grid_jump_limit = comm->cellsize_limit;
2600     if (!comm->bVacDLBNoLimit)
2601     {
2602         grid_jump_limit = max(grid_jump_limit,
2603                               comm->cutoff/comm->cd[dim_ind].np);
2604     }
2605
2606     return grid_jump_limit;
2607 }
2608
2609 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2610 {
2611     gmx_domdec_comm_t *comm;
2612     int  d,dim;
2613     real limit,bfac;
2614
2615     comm = dd->comm;
2616
2617     for(d=1; d<dd->ndim; d++)
2618     {
2619         dim = dd->dim[d];
2620         limit = grid_jump_limit(comm,d);
2621         bfac = ddbox->box_size[dim];
2622         if (ddbox->tric_dir[dim])
2623         {
2624             bfac *= ddbox->skew_fac[dim];
2625         }
2626         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2627             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2628         {
2629             char buf[22];
2630             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2631                       gmx_step_str(step,buf),
2632                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2633         }
2634     }
2635 }
2636
2637 static int dd_load_count(gmx_domdec_comm_t *comm)
2638 {
2639     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2640 }
2641
2642 static float dd_force_load(gmx_domdec_comm_t *comm)
2643 {
2644     float load;
2645
2646     if (comm->eFlop)
2647     {
2648         load = comm->flop;
2649         if (comm->eFlop > 1)
2650         {
2651             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2652         }
2653     }
2654     else
2655     {
2656         load = comm->cycl[ddCyclF];
2657         if (comm->cycl_n[ddCyclF] > 1)
2658         {
2659             /* Subtract the maximum of the last n cycle counts
2660              * to get rid of possible high counts due to other soures,
2661              * for instance system activity, that would otherwise
2662              * affect the dynamic load balancing.
2663              */
2664             load -= comm->cycl_max[ddCyclF];
2665         }
2666     }
2667
2668     return load;
2669 }
2670
2671 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2672 {
2673     gmx_domdec_comm_t *comm;
2674     int i;
2675
2676     comm = dd->comm;
2677
2678     snew(*dim_f,dd->nc[dim]+1);
2679     (*dim_f)[0] = 0;
2680     for(i=1; i<dd->nc[dim]; i++)
2681     {
2682         if (comm->slb_frac[dim])
2683         {
2684             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2685         }
2686         else
2687         {
2688             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2689         }
2690     }
2691     (*dim_f)[dd->nc[dim]] = 1;
2692 }
2693
2694 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2695 {
2696     int  pmeindex,slab,nso,i;
2697     ivec xyz;
2698
2699     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2700     {
2701         ddpme->dim = YY;
2702     }
2703     else
2704     {
2705         ddpme->dim = dimind;
2706     }
2707     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2708
2709     ddpme->nslab = (ddpme->dim == 0 ?
2710                     dd->comm->npmenodes_x :
2711                     dd->comm->npmenodes_y);
2712
2713     if (ddpme->nslab <= 1)
2714     {
2715         return;
2716     }
2717
2718     nso = dd->comm->npmenodes/ddpme->nslab;
2719     /* Determine for each PME slab the PP location range for dimension dim */
2720     snew(ddpme->pp_min,ddpme->nslab);
2721     snew(ddpme->pp_max,ddpme->nslab);
2722     for(slab=0; slab<ddpme->nslab; slab++) {
2723         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2724         ddpme->pp_max[slab] = 0;
2725     }
2726     for(i=0; i<dd->nnodes; i++) {
2727         ddindex2xyz(dd->nc,i,xyz);
2728         /* For y only use our y/z slab.
2729          * This assumes that the PME x grid size matches the DD grid size.
2730          */
2731         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2732             pmeindex = ddindex2pmeindex(dd,i);
2733             if (dimind == 0) {
2734                 slab = pmeindex/nso;
2735             } else {
2736                 slab = pmeindex % ddpme->nslab;
2737             }
2738             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2739             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2740         }
2741     }
2742
2743     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2744 }
2745
2746 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2747 {
2748     if (dd->comm->ddpme[0].dim == XX)
2749     {
2750         return dd->comm->ddpme[0].maxshift;
2751     }
2752     else
2753     {
2754         return 0;
2755     }
2756 }
2757
2758 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2759 {
2760     if (dd->comm->ddpme[0].dim == YY)
2761     {
2762         return dd->comm->ddpme[0].maxshift;
2763     }
2764     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2765     {
2766         return dd->comm->ddpme[1].maxshift;
2767     }
2768     else
2769     {
2770         return 0;
2771     }
2772 }
2773
2774 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2775                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2776 {
2777     gmx_domdec_comm_t *comm;
2778     int  nc,ns,s;
2779     int  *xmin,*xmax;
2780     real range,pme_boundary;
2781     int  sh;
2782
2783     comm = dd->comm;
2784     nc  = dd->nc[ddpme->dim];
2785     ns  = ddpme->nslab;
2786
2787     if (!ddpme->dim_match)
2788     {
2789         /* PP decomposition is not along dim: the worst situation */
2790         sh = ns/2;
2791     }
2792     else if (ns <= 3 || (bUniform && ns == nc))
2793     {
2794         /* The optimal situation */
2795         sh = 1;
2796     }
2797     else
2798     {
2799         /* We need to check for all pme nodes which nodes they
2800          * could possibly need to communicate with.
2801          */
2802         xmin = ddpme->pp_min;
2803         xmax = ddpme->pp_max;
2804         /* Allow for atoms to be maximally 2/3 times the cut-off
2805          * out of their DD cell. This is a reasonable balance between
2806          * between performance and support for most charge-group/cut-off
2807          * combinations.
2808          */
2809         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2810         /* Avoid extra communication when we are exactly at a boundary */
2811         range *= 0.999;
2812
2813         sh = 1;
2814         for(s=0; s<ns; s++)
2815         {
2816             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2817             pme_boundary = (real)s/ns;
2818             while (sh+1 < ns &&
2819                    ((s-(sh+1) >= 0 &&
2820                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2821                     (s-(sh+1) <  0 &&
2822                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2823             {
2824                 sh++;
2825             }
2826             pme_boundary = (real)(s+1)/ns;
2827             while (sh+1 < ns &&
2828                    ((s+(sh+1) <  ns &&
2829                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2830                     (s+(sh+1) >= ns &&
2831                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2832             {
2833                 sh++;
2834             }
2835         }
2836     }
2837
2838     ddpme->maxshift = sh;
2839
2840     if (debug)
2841     {
2842         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2843                 ddpme->dim,ddpme->maxshift);
2844     }
2845 }
2846
2847 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2848 {
2849     int d,dim;
2850
2851     for(d=0; d<dd->ndim; d++)
2852     {
2853         dim = dd->dim[d];
2854         if (dim < ddbox->nboundeddim &&
2855             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2856             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2857         {
2858             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2859                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2860                       dd->nc[dim],dd->comm->cellsize_limit);
2861         }
2862     }
2863 }
2864
2865 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2866                                   gmx_bool bMaster,ivec npulse)
2867 {
2868     gmx_domdec_comm_t *comm;
2869     int  d,j;
2870     rvec cellsize_min;
2871     real *cell_x,cell_dx,cellsize;
2872
2873     comm = dd->comm;
2874
2875     for(d=0; d<DIM; d++)
2876     {
2877         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2878         npulse[d] = 1;
2879         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2880         {
2881             /* Uniform grid */
2882             cell_dx = ddbox->box_size[d]/dd->nc[d];
2883             if (bMaster)
2884             {
2885                 for(j=0; j<dd->nc[d]+1; j++)
2886                 {
2887                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2888                 }
2889             }
2890             else
2891             {
2892                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2893                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2894             }
2895             cellsize = cell_dx*ddbox->skew_fac[d];
2896             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2897             {
2898                 npulse[d]++;
2899             }
2900             cellsize_min[d] = cellsize;
2901         }
2902         else
2903         {
2904             /* Statically load balanced grid */
2905             /* Also when we are not doing a master distribution we determine
2906              * all cell borders in a loop to obtain identical values
2907              * to the master distribution case and to determine npulse.
2908              */
2909             if (bMaster)
2910             {
2911                 cell_x = dd->ma->cell_x[d];
2912             }
2913             else
2914             {
2915                 snew(cell_x,dd->nc[d]+1);
2916             }
2917             cell_x[0] = ddbox->box0[d];
2918             for(j=0; j<dd->nc[d]; j++)
2919             {
2920                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2921                 cell_x[j+1] = cell_x[j] + cell_dx;
2922                 cellsize = cell_dx*ddbox->skew_fac[d];
2923                 while (cellsize*npulse[d] < comm->cutoff &&
2924                        npulse[d] < dd->nc[d]-1)
2925                 {
2926                     npulse[d]++;
2927                 }
2928                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2929             }
2930             if (!bMaster)
2931             {
2932                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2933                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2934                 sfree(cell_x);
2935             }
2936         }
2937         /* The following limitation is to avoid that a cell would receive
2938          * some of its own home charge groups back over the periodic boundary.
2939          * Double charge groups cause trouble with the global indices.
2940          */
2941         if (d < ddbox->npbcdim &&
2942             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2943         {
2944             gmx_fatal_collective(FARGS,NULL,dd,
2945                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2946                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2947                                  comm->cutoff,
2948                                  dd->nc[d],dd->nc[d],
2949                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2950         }
2951     }
2952
2953     if (!comm->bDynLoadBal)
2954     {
2955         copy_rvec(cellsize_min,comm->cellsize_min);
2956     }
2957
2958     for(d=0; d<comm->npmedecompdim; d++)
2959     {
2960         set_pme_maxshift(dd,&comm->ddpme[d],
2961                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2962                          comm->ddpme[d].slb_dim_f);
2963     }
2964 }
2965
2966
2967 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2968                                        int d,int dim,gmx_domdec_root_t *root,
2969                                        gmx_ddbox_t *ddbox,
2970                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2971 {
2972     gmx_domdec_comm_t *comm;
2973     int  ncd,i,j,nmin,nmin_old;
2974     gmx_bool bLimLo,bLimHi;
2975     real *cell_size;
2976     real fac,halfway,cellsize_limit_f_i,region_size;
2977     gmx_bool bPBC,bLastHi=FALSE;
2978     int nrange[]={range[0],range[1]};
2979
2980     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2981
2982     comm = dd->comm;
2983
2984     ncd = dd->nc[dim];
2985
2986     bPBC = (dim < ddbox->npbcdim);
2987
2988     cell_size = root->buf_ncd;
2989
2990     if (debug)
2991     {
2992         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2993     }
2994
2995     /* First we need to check if the scaling does not make cells
2996      * smaller than the smallest allowed size.
2997      * We need to do this iteratively, since if a cell is too small,
2998      * it needs to be enlarged, which makes all the other cells smaller,
2999      * which could in turn make another cell smaller than allowed.
3000      */
3001     for(i=range[0]; i<range[1]; i++)
3002     {
3003         root->bCellMin[i] = FALSE;
3004     }
3005     nmin = 0;
3006     do
3007     {
3008         nmin_old = nmin;
3009         /* We need the total for normalization */
3010         fac = 0;
3011         for(i=range[0]; i<range[1]; i++)
3012         {
3013             if (root->bCellMin[i] == FALSE)
3014             {
3015                 fac += cell_size[i];
3016             }
3017         }
3018         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3019         /* Determine the cell boundaries */
3020         for(i=range[0]; i<range[1]; i++)
3021         {
3022             if (root->bCellMin[i] == FALSE)
3023             {
3024                 cell_size[i] *= fac;
3025                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3026                 {
3027                     cellsize_limit_f_i = 0;
3028                 }
3029                 else
3030                 {
3031                     cellsize_limit_f_i = cellsize_limit_f;
3032                 }
3033                 if (cell_size[i] < cellsize_limit_f_i)
3034                 {
3035                     root->bCellMin[i] = TRUE;
3036                     cell_size[i] = cellsize_limit_f_i;
3037                     nmin++;
3038                 }
3039             }
3040             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3041         }
3042     }
3043     while (nmin > nmin_old);
3044
3045     i=range[1]-1;
3046     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3047     /* For this check we should not use DD_CELL_MARGIN,
3048      * but a slightly smaller factor,
3049      * since rounding could get use below the limit.
3050      */
3051     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3052     {
3053         char buf[22];
3054         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3055                   gmx_step_str(step,buf),
3056                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3057                   ncd,comm->cellsize_min[dim]);
3058     }
3059
3060     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3061
3062     if (!bUniform)
3063     {
3064         /* Check if the boundary did not displace more than halfway
3065          * each of the cells it bounds, as this could cause problems,
3066          * especially when the differences between cell sizes are large.
3067          * If changes are applied, they will not make cells smaller
3068          * than the cut-off, as we check all the boundaries which
3069          * might be affected by a change and if the old state was ok,
3070          * the cells will at most be shrunk back to their old size.
3071          */
3072         for(i=range[0]+1; i<range[1]; i++)
3073         {
3074             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3075             if (root->cell_f[i] < halfway)
3076             {
3077                 root->cell_f[i] = halfway;
3078                 /* Check if the change also causes shifts of the next boundaries */
3079                 for(j=i+1; j<range[1]; j++)
3080                 {
3081                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3082                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3083                 }
3084             }
3085             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3086             if (root->cell_f[i] > halfway)
3087             {
3088                 root->cell_f[i] = halfway;
3089                 /* Check if the change also causes shifts of the next boundaries */
3090                 for(j=i-1; j>=range[0]+1; j--)
3091                 {
3092                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3093                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3094                 }
3095             }
3096         }
3097     }
3098
3099     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3100     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3101      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3102      * for a and b nrange is used */
3103     if (d > 0)
3104     {
3105         /* Take care of the staggering of the cell boundaries */
3106         if (bUniform)
3107         {
3108             for(i=range[0]; i<range[1]; i++)
3109             {
3110                 root->cell_f_max0[i] = root->cell_f[i];
3111                 root->cell_f_min1[i] = root->cell_f[i+1];
3112             }
3113         }
3114         else
3115         {
3116             for(i=range[0]+1; i<range[1]; i++)
3117             {
3118                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3119                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3120                 if (bLimLo && bLimHi)
3121                 {
3122                     /* Both limits violated, try the best we can */
3123                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3124                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3125                     nrange[0]=range[0];
3126                     nrange[1]=i;
3127                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3128
3129                     nrange[0]=i;
3130                     nrange[1]=range[1];
3131                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3132
3133                     return;
3134                 }
3135                 else if (bLimLo)
3136                 {
3137                     /* root->cell_f[i] = root->bound_min[i]; */
3138                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3139                     bLastHi=FALSE;
3140                 }
3141                 else if (bLimHi && !bLastHi)
3142                 {
3143                     bLastHi=TRUE;
3144                     if (nrange[1] < range[1])   /* found a LimLo before */
3145                     {
3146                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3147                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3148                         nrange[0]=nrange[1];
3149                     }
3150                     root->cell_f[i] = root->bound_max[i];
3151                     nrange[1]=i;
3152                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3153                     nrange[0]=i;
3154                     nrange[1]=range[1];
3155                 }
3156             }
3157             if (nrange[1] < range[1])   /* found last a LimLo */
3158             {
3159                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3160                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3161                 nrange[0]=nrange[1];
3162                 nrange[1]=range[1];
3163                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3164             }
3165             else if (nrange[0] > range[0]) /* found at least one LimHi */
3166             {
3167                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3168             }
3169         }
3170     }
3171 }
3172
3173
3174 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3175                                        int d,int dim,gmx_domdec_root_t *root,
3176                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3177                                        gmx_bool bUniform,gmx_large_int_t step)
3178 {
3179     gmx_domdec_comm_t *comm;
3180     int  ncd,d1,i,j,pos;
3181     real *cell_size;
3182     real load_aver,load_i,imbalance,change,change_max,sc;
3183     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3184     real change_limit = 0.1;
3185     real relax = 0.5;
3186     gmx_bool bPBC;
3187     int range[] = { 0, 0 };
3188
3189     comm = dd->comm;
3190
3191     ncd = dd->nc[dim];
3192
3193     bPBC = (dim < ddbox->npbcdim);
3194
3195     cell_size = root->buf_ncd;
3196
3197     /* Store the original boundaries */
3198     for(i=0; i<ncd+1; i++)
3199     {
3200         root->old_cell_f[i] = root->cell_f[i];
3201     }
3202     if (bUniform) {
3203         for(i=0; i<ncd; i++)
3204         {
3205             cell_size[i] = 1.0/ncd;
3206         }
3207     }
3208     else if (dd_load_count(comm))
3209     {
3210         load_aver = comm->load[d].sum_m/ncd;
3211         change_max = 0;
3212         for(i=0; i<ncd; i++)
3213         {
3214             /* Determine the relative imbalance of cell i */
3215             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3216             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3217             /* Determine the change of the cell size using underrelaxation */
3218             change = -relax*imbalance;
3219             change_max = max(change_max,max(change,-change));
3220         }
3221         /* Limit the amount of scaling.
3222          * We need to use the same rescaling for all cells in one row,
3223          * otherwise the load balancing might not converge.
3224          */
3225         sc = relax;
3226         if (change_max > change_limit)
3227         {
3228             sc *= change_limit/change_max;
3229         }
3230         for(i=0; i<ncd; i++)
3231         {
3232             /* Determine the relative imbalance of cell i */
3233             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3234             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3235             /* Determine the change of the cell size using underrelaxation */
3236             change = -sc*imbalance;
3237             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3238         }
3239     }
3240
3241     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3242     cellsize_limit_f *= DD_CELL_MARGIN;
3243     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3244     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3245     if (ddbox->tric_dir[dim])
3246     {
3247         cellsize_limit_f /= ddbox->skew_fac[dim];
3248         dist_min_f       /= ddbox->skew_fac[dim];
3249     }
3250     if (bDynamicBox && d > 0)
3251     {
3252         dist_min_f *= DD_PRES_SCALE_MARGIN;
3253     }
3254     if (d > 0 && !bUniform)
3255     {
3256         /* Make sure that the grid is not shifted too much */
3257         for(i=1; i<ncd; i++) {
3258             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3259             {
3260                 gmx_incons("Inconsistent DD boundary staggering limits!");
3261             }
3262             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3263             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3264             if (space > 0) {
3265                 root->bound_min[i] += 0.5*space;
3266             }
3267             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3268             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3269             if (space < 0) {
3270                 root->bound_max[i] += 0.5*space;
3271             }
3272             if (debug)
3273             {
3274                 fprintf(debug,
3275                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3276                         d,i,
3277                         root->cell_f_max0[i-1] + dist_min_f,
3278                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3279                         root->cell_f_min1[i] - dist_min_f);
3280             }
3281         }
3282     }
3283     range[1]=ncd;
3284     root->cell_f[0] = 0;
3285     root->cell_f[ncd] = 1;
3286     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3287
3288
3289     /* After the checks above, the cells should obey the cut-off
3290      * restrictions, but it does not hurt to check.
3291      */
3292     for(i=0; i<ncd; i++)
3293     {
3294         if (debug)
3295         {
3296             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3297                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3298         }
3299
3300         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3301             root->cell_f[i+1] - root->cell_f[i] <
3302             cellsize_limit_f/DD_CELL_MARGIN)
3303         {
3304             char buf[22];
3305             fprintf(stderr,
3306                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3307                     gmx_step_str(step,buf),dim2char(dim),i,
3308                     (root->cell_f[i+1] - root->cell_f[i])
3309                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3310         }
3311     }
3312
3313     pos = ncd + 1;
3314     /* Store the cell boundaries of the lower dimensions at the end */
3315     for(d1=0; d1<d; d1++)
3316     {
3317         root->cell_f[pos++] = comm->cell_f0[d1];
3318         root->cell_f[pos++] = comm->cell_f1[d1];
3319     }
3320
3321     if (d < comm->npmedecompdim)
3322     {
3323         /* The master determines the maximum shift for
3324          * the coordinate communication between separate PME nodes.
3325          */
3326         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3327     }
3328     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3329     if (d >= 1)
3330     {
3331         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3332     }
3333 }
3334
3335 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3336                                              gmx_ddbox_t *ddbox,int dimind)
3337 {
3338     gmx_domdec_comm_t *comm;
3339     int dim;
3340
3341     comm = dd->comm;
3342
3343     /* Set the cell dimensions */
3344     dim = dd->dim[dimind];
3345     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3346     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3347     if (dim >= ddbox->nboundeddim)
3348     {
3349         comm->cell_x0[dim] += ddbox->box0[dim];
3350         comm->cell_x1[dim] += ddbox->box0[dim];
3351     }
3352 }
3353
3354 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3355                                          int d,int dim,real *cell_f_row,
3356                                          gmx_ddbox_t *ddbox)
3357 {
3358     gmx_domdec_comm_t *comm;
3359     int d1,dim1,pos;
3360
3361     comm = dd->comm;
3362
3363 #ifdef GMX_MPI
3364     /* Each node would only need to know two fractions,
3365      * but it is probably cheaper to broadcast the whole array.
3366      */
3367     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3368               0,comm->mpi_comm_load[d]);
3369 #endif
3370     /* Copy the fractions for this dimension from the buffer */
3371     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3372     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3373     /* The whole array was communicated, so set the buffer position */
3374     pos = dd->nc[dim] + 1;
3375     for(d1=0; d1<=d; d1++)
3376     {
3377         if (d1 < d)
3378         {
3379             /* Copy the cell fractions of the lower dimensions */
3380             comm->cell_f0[d1] = cell_f_row[pos++];
3381             comm->cell_f1[d1] = cell_f_row[pos++];
3382         }
3383         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3384     }
3385     /* Convert the communicated shift from float to int */
3386     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3387     if (d >= 1)
3388     {
3389         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3390     }
3391 }
3392
3393 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3394                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3395                                          gmx_bool bUniform,gmx_large_int_t step)
3396 {
3397     gmx_domdec_comm_t *comm;
3398     int d,dim,d1;
3399     gmx_bool bRowMember,bRowRoot;
3400     real *cell_f_row;
3401
3402     comm = dd->comm;
3403
3404     for(d=0; d<dd->ndim; d++)
3405     {
3406         dim = dd->dim[d];
3407         bRowMember = TRUE;
3408         bRowRoot = TRUE;
3409         for(d1=d; d1<dd->ndim; d1++)
3410         {
3411             if (dd->ci[dd->dim[d1]] > 0)
3412             {
3413                 if (d1 > d)
3414                 {
3415                     bRowMember = FALSE;
3416                 }
3417                 bRowRoot = FALSE;
3418             }
3419         }
3420         if (bRowMember)
3421         {
3422             if (bRowRoot)
3423             {
3424                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3425                                            ddbox,bDynamicBox,bUniform,step);
3426                 cell_f_row = comm->root[d]->cell_f;
3427             }
3428             else
3429             {
3430                 cell_f_row = comm->cell_f_row;
3431             }
3432             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3433         }
3434     }
3435 }
3436
3437 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3438 {
3439     int d;
3440
3441     /* This function assumes the box is static and should therefore
3442      * not be called when the box has changed since the last
3443      * call to dd_partition_system.
3444      */
3445     for(d=0; d<dd->ndim; d++)
3446     {
3447         relative_to_absolute_cell_bounds(dd,ddbox,d);
3448     }
3449 }
3450
3451
3452
3453 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3454                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3455                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3456                                   gmx_wallcycle_t wcycle)
3457 {
3458     gmx_domdec_comm_t *comm;
3459     int dim;
3460
3461     comm = dd->comm;
3462
3463     if (bDoDLB)
3464     {
3465         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3466         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3467         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3468     }
3469     else if (bDynamicBox)
3470     {
3471         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3472     }
3473
3474     /* Set the dimensions for which no DD is used */
3475     for(dim=0; dim<DIM; dim++) {
3476         if (dd->nc[dim] == 1) {
3477             comm->cell_x0[dim] = 0;
3478             comm->cell_x1[dim] = ddbox->box_size[dim];
3479             if (dim >= ddbox->nboundeddim)
3480             {
3481                 comm->cell_x0[dim] += ddbox->box0[dim];
3482                 comm->cell_x1[dim] += ddbox->box0[dim];
3483             }
3484         }
3485     }
3486 }
3487
3488 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3489 {
3490     int d,np,i;
3491     gmx_domdec_comm_dim_t *cd;
3492
3493     for(d=0; d<dd->ndim; d++)
3494     {
3495         cd = &dd->comm->cd[d];
3496         np = npulse[dd->dim[d]];
3497         if (np > cd->np_nalloc)
3498         {
3499             if (debug)
3500             {
3501                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3502                         dim2char(dd->dim[d]),np);
3503             }
3504             if (DDMASTER(dd) && cd->np_nalloc > 0)
3505             {
3506                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3507             }
3508             srenew(cd->ind,np);
3509             for(i=cd->np_nalloc; i<np; i++)
3510             {
3511                 cd->ind[i].index  = NULL;
3512                 cd->ind[i].nalloc = 0;
3513             }
3514             cd->np_nalloc = np;
3515         }
3516         cd->np = np;
3517     }
3518 }
3519
3520
3521 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3522                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3523                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3524                               gmx_wallcycle_t wcycle)
3525 {
3526     gmx_domdec_comm_t *comm;
3527     int  d;
3528     ivec npulse;
3529
3530     comm = dd->comm;
3531
3532     /* Copy the old cell boundaries for the cg displacement check */
3533     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3534     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3535
3536     if (comm->bDynLoadBal)
3537     {
3538         if (DDMASTER(dd))
3539         {
3540             check_box_size(dd,ddbox);
3541         }
3542         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3543     }
3544     else
3545     {
3546         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3547         realloc_comm_ind(dd,npulse);
3548     }
3549
3550     if (debug)
3551     {
3552         for(d=0; d<DIM; d++)
3553         {
3554             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3555                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3556         }
3557     }
3558 }
3559
3560 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3561                                   gmx_ddbox_t *ddbox,
3562                                   rvec cell_ns_x0,rvec cell_ns_x1,
3563                                   gmx_large_int_t step)
3564 {
3565     gmx_domdec_comm_t *comm;
3566     int dim_ind,dim;
3567
3568     comm = dd->comm;
3569
3570     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3571     {
3572         dim = dd->dim[dim_ind];
3573
3574         /* Without PBC we don't have restrictions on the outer cells */
3575         if (!(dim >= ddbox->npbcdim &&
3576               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3577             comm->bDynLoadBal &&
3578             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3579             comm->cellsize_min[dim])
3580         {
3581             char buf[22];
3582             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3583                       gmx_step_str(step,buf),dim2char(dim),
3584                       comm->cell_x1[dim] - comm->cell_x0[dim],
3585                       ddbox->skew_fac[dim],
3586                       dd->comm->cellsize_min[dim],
3587                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3588         }
3589     }
3590
3591     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3592     {
3593         /* Communicate the boundaries and update cell_ns_x0/1 */
3594         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3595         if (dd->bGridJump && dd->ndim > 1)
3596         {
3597             check_grid_jump(step,dd,ddbox);
3598         }
3599     }
3600 }
3601
3602 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3603 {
3604     if (YY < npbcdim)
3605     {
3606         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3607     }
3608     else
3609     {
3610         tcm[YY][XX] = 0;
3611     }
3612     if (ZZ < npbcdim)
3613     {
3614         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3615         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3616     }
3617     else
3618     {
3619         tcm[ZZ][XX] = 0;
3620         tcm[ZZ][YY] = 0;
3621     }
3622 }
3623
3624 static void check_screw_box(matrix box)
3625 {
3626     /* Mathematical limitation */
3627     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3628     {
3629         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3630     }
3631
3632     /* Limitation due to the asymmetry of the eighth shell method */
3633     if (box[ZZ][YY] != 0)
3634     {
3635         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3636     }
3637 }
3638
3639 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3640                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3641                           gmx_domdec_t *dd)
3642 {
3643     gmx_domdec_master_t *ma;
3644     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3645     int  i,icg,j,k,k0,k1,d,npbcdim;
3646     matrix tcm;
3647     rvec box_size,cg_cm;
3648     ivec ind;
3649     real nrcg,inv_ncg,pos_d;
3650     atom_id *cgindex;
3651     gmx_bool bUnbounded,bScrew;
3652
3653     ma = dd->ma;
3654
3655     if (tmp_ind == NULL)
3656     {
3657         snew(tmp_nalloc,dd->nnodes);
3658         snew(tmp_ind,dd->nnodes);
3659         for(i=0; i<dd->nnodes; i++)
3660         {
3661             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3662             snew(tmp_ind[i],tmp_nalloc[i]);
3663         }
3664     }
3665
3666     /* Clear the count */
3667     for(i=0; i<dd->nnodes; i++)
3668     {
3669         ma->ncg[i] = 0;
3670         ma->nat[i] = 0;
3671     }
3672
3673     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3674
3675     cgindex = cgs->index;
3676
3677     /* Compute the center of geometry for all charge groups */
3678     for(icg=0; icg<cgs->nr; icg++)
3679     {
3680         k0      = cgindex[icg];
3681         k1      = cgindex[icg+1];
3682         nrcg    = k1 - k0;
3683         if (nrcg == 1)
3684         {
3685             copy_rvec(pos[k0],cg_cm);
3686         }
3687         else
3688         {
3689             inv_ncg = 1.0/nrcg;
3690
3691             clear_rvec(cg_cm);
3692             for(k=k0; (k<k1); k++)
3693             {
3694                 rvec_inc(cg_cm,pos[k]);
3695             }
3696             for(d=0; (d<DIM); d++)
3697             {
3698                 cg_cm[d] *= inv_ncg;
3699             }
3700         }
3701         /* Put the charge group in the box and determine the cell index */
3702         for(d=DIM-1; d>=0; d--) {
3703             pos_d = cg_cm[d];
3704             if (d < dd->npbcdim)
3705             {
3706                 bScrew = (dd->bScrewPBC && d == XX);
3707                 if (tric_dir[d] && dd->nc[d] > 1)
3708                 {
3709                     /* Use triclinic coordintates for this dimension */
3710                     for(j=d+1; j<DIM; j++)
3711                     {
3712                         pos_d += cg_cm[j]*tcm[j][d];
3713                     }
3714                 }
3715                 while(pos_d >= box[d][d])
3716                 {
3717                     pos_d -= box[d][d];
3718                     rvec_dec(cg_cm,box[d]);
3719                     if (bScrew)
3720                     {
3721                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3722                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3723                     }
3724                     for(k=k0; (k<k1); k++)
3725                     {
3726                         rvec_dec(pos[k],box[d]);
3727                         if (bScrew)
3728                         {
3729                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3730                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3731                         }
3732                     }
3733                 }
3734                 while(pos_d < 0)
3735                 {
3736                     pos_d += box[d][d];
3737                     rvec_inc(cg_cm,box[d]);
3738                     if (bScrew)
3739                     {
3740                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3741                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3742                     }
3743                     for(k=k0; (k<k1); k++)
3744                     {
3745                         rvec_inc(pos[k],box[d]);
3746                         if (bScrew) {
3747                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3748                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3749                         }
3750                     }
3751                 }
3752             }
3753             /* This could be done more efficiently */
3754             ind[d] = 0;
3755             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3756             {
3757                 ind[d]++;
3758             }
3759         }
3760         i = dd_index(dd->nc,ind);
3761         if (ma->ncg[i] == tmp_nalloc[i])
3762         {
3763             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3764             srenew(tmp_ind[i],tmp_nalloc[i]);
3765         }
3766         tmp_ind[i][ma->ncg[i]] = icg;
3767         ma->ncg[i]++;
3768         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3769     }
3770
3771     k1 = 0;
3772     for(i=0; i<dd->nnodes; i++)
3773     {
3774         ma->index[i] = k1;
3775         for(k=0; k<ma->ncg[i]; k++)
3776         {
3777             ma->cg[k1++] = tmp_ind[i][k];
3778         }
3779     }
3780     ma->index[dd->nnodes] = k1;
3781
3782     for(i=0; i<dd->nnodes; i++)
3783     {
3784         sfree(tmp_ind[i]);
3785     }
3786     sfree(tmp_ind);
3787     sfree(tmp_nalloc);
3788
3789     if (fplog)
3790     {
3791         char buf[22];
3792         fprintf(fplog,"Charge group distribution at step %s:",
3793                 gmx_step_str(step,buf));
3794         for(i=0; i<dd->nnodes; i++)
3795         {
3796             fprintf(fplog," %d",ma->ncg[i]);
3797         }
3798         fprintf(fplog,"\n");
3799     }
3800 }
3801
3802 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3803                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3804                                 rvec pos[])
3805 {
3806     gmx_domdec_master_t *ma=NULL;
3807     ivec npulse;
3808     int  i,cg_gl;
3809     int  *ibuf,buf2[2] = { 0, 0 };
3810
3811     if (DDMASTER(dd))
3812     {
3813         ma = dd->ma;
3814
3815         if (dd->bScrewPBC)
3816         {
3817             check_screw_box(box);
3818         }
3819
3820         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3821
3822         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3823         for(i=0; i<dd->nnodes; i++)
3824         {
3825             ma->ibuf[2*i]   = ma->ncg[i];
3826             ma->ibuf[2*i+1] = ma->nat[i];
3827         }
3828         ibuf = ma->ibuf;
3829     }
3830     else
3831     {
3832         ibuf = NULL;
3833     }
3834     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3835
3836     dd->ncg_home = buf2[0];
3837     dd->nat_home = buf2[1];
3838     dd->ncg_tot  = dd->ncg_home;
3839     dd->nat_tot  = dd->nat_home;
3840     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3841     {
3842         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3843         srenew(dd->index_gl,dd->cg_nalloc);
3844         srenew(dd->cgindex,dd->cg_nalloc+1);
3845     }
3846     if (DDMASTER(dd))
3847     {
3848         for(i=0; i<dd->nnodes; i++)
3849         {
3850             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3851             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3852         }
3853     }
3854
3855     dd_scatterv(dd,
3856                 DDMASTER(dd) ? ma->ibuf : NULL,
3857                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3858                 DDMASTER(dd) ? ma->cg : NULL,
3859                 dd->ncg_home*sizeof(int),dd->index_gl);
3860
3861     /* Determine the home charge group sizes */
3862     dd->cgindex[0] = 0;
3863     for(i=0; i<dd->ncg_home; i++)
3864     {
3865         cg_gl = dd->index_gl[i];
3866         dd->cgindex[i+1] =
3867             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3868     }
3869
3870     if (debug)
3871     {
3872         fprintf(debug,"Home charge groups:\n");
3873         for(i=0; i<dd->ncg_home; i++)
3874         {
3875             fprintf(debug," %d",dd->index_gl[i]);
3876             if (i % 10 == 9)
3877                 fprintf(debug,"\n");
3878         }
3879         fprintf(debug,"\n");
3880     }
3881 }
3882
3883 static int compact_and_copy_vec_at(int ncg,int *move,
3884                                    int *cgindex,
3885                                    int nvec,int vec,
3886                                    rvec *src,gmx_domdec_comm_t *comm,
3887                                    gmx_bool bCompact)
3888 {
3889     int m,icg,i,i0,i1,nrcg;
3890     int home_pos;
3891     int pos_vec[DIM*2];
3892
3893     home_pos = 0;
3894
3895     for(m=0; m<DIM*2; m++)
3896     {
3897         pos_vec[m] = 0;
3898     }
3899
3900     i0 = 0;
3901     for(icg=0; icg<ncg; icg++)
3902     {
3903         i1 = cgindex[icg+1];
3904         m = move[icg];
3905         if (m == -1)
3906         {
3907             if (bCompact)
3908             {
3909                 /* Compact the home array in place */
3910                 for(i=i0; i<i1; i++)
3911                 {
3912                     copy_rvec(src[i],src[home_pos++]);
3913                 }
3914             }
3915         }
3916         else
3917         {
3918             /* Copy to the communication buffer */
3919             nrcg = i1 - i0;
3920             pos_vec[m] += 1 + vec*nrcg;
3921             for(i=i0; i<i1; i++)
3922             {
3923                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3924             }
3925             pos_vec[m] += (nvec - vec - 1)*nrcg;
3926         }
3927         if (!bCompact)
3928         {
3929             home_pos += i1 - i0;
3930         }
3931         i0 = i1;
3932     }
3933
3934     return home_pos;
3935 }
3936
3937 static int compact_and_copy_vec_cg(int ncg,int *move,
3938                                    int *cgindex,
3939                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3940                                    gmx_bool bCompact)
3941 {
3942     int m,icg,i0,i1,nrcg;
3943     int home_pos;
3944     int pos_vec[DIM*2];
3945
3946     home_pos = 0;
3947
3948     for(m=0; m<DIM*2; m++)
3949     {
3950         pos_vec[m] = 0;
3951     }
3952
3953     i0 = 0;
3954     for(icg=0; icg<ncg; icg++)
3955     {
3956         i1 = cgindex[icg+1];
3957         m = move[icg];
3958         if (m == -1)
3959         {
3960             if (bCompact)
3961             {
3962                 /* Compact the home array in place */
3963                 copy_rvec(src[icg],src[home_pos++]);
3964             }
3965         }
3966         else
3967         {
3968             nrcg = i1 - i0;
3969             /* Copy to the communication buffer */
3970             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3971             pos_vec[m] += 1 + nrcg*nvec;
3972         }
3973         i0 = i1;
3974     }
3975     if (!bCompact)
3976     {
3977         home_pos = ncg;
3978     }
3979
3980     return home_pos;
3981 }
3982
3983 static int compact_ind(int ncg,int *move,
3984                        int *index_gl,int *cgindex,
3985                        int *gatindex,
3986                        gmx_ga2la_t ga2la,char *bLocalCG,
3987                        int *cginfo)
3988 {
3989     int cg,nat,a0,a1,a,a_gl;
3990     int home_pos;
3991
3992     home_pos = 0;
3993     nat = 0;
3994     for(cg=0; cg<ncg; cg++)
3995     {
3996         a0 = cgindex[cg];
3997         a1 = cgindex[cg+1];
3998         if (move[cg] == -1)
3999         {
4000             /* Compact the home arrays in place.
4001              * Anything that can be done here avoids access to global arrays.
4002              */
4003             cgindex[home_pos] = nat;
4004             for(a=a0; a<a1; a++)
4005             {
4006                 a_gl = gatindex[a];
4007                 gatindex[nat] = a_gl;
4008                 /* The cell number stays 0, so we don't need to set it */
4009                 ga2la_change_la(ga2la,a_gl,nat);
4010                 nat++;
4011             }
4012             index_gl[home_pos] = index_gl[cg];
4013             cginfo[home_pos]   = cginfo[cg];
4014             /* The charge group remains local, so bLocalCG does not change */
4015             home_pos++;
4016         }
4017         else
4018         {
4019             /* Clear the global indices */
4020             for(a=a0; a<a1; a++)
4021             {
4022                 ga2la_del(ga2la,gatindex[a]);
4023             }
4024             if (bLocalCG)
4025             {
4026                 bLocalCG[index_gl[cg]] = FALSE;
4027             }
4028         }
4029     }
4030     cgindex[home_pos] = nat;
4031
4032     return home_pos;
4033 }
4034
4035 static void clear_and_mark_ind(int ncg,int *move,
4036                                int *index_gl,int *cgindex,int *gatindex,
4037                                gmx_ga2la_t ga2la,char *bLocalCG,
4038                                int *cell_index)
4039 {
4040     int cg,a0,a1,a;
4041
4042     for(cg=0; cg<ncg; cg++)
4043     {
4044         if (move[cg] >= 0)
4045         {
4046             a0 = cgindex[cg];
4047             a1 = cgindex[cg+1];
4048             /* Clear the global indices */
4049             for(a=a0; a<a1; a++)
4050             {
4051                 ga2la_del(ga2la,gatindex[a]);
4052             }
4053             if (bLocalCG)
4054             {
4055                 bLocalCG[index_gl[cg]] = FALSE;
4056             }
4057             /* Signal that this cg has moved using the ns cell index.
4058              * Here we set it to -1.
4059              * fill_grid will change it from -1 to 4*grid->ncells.
4060              */
4061             cell_index[cg] = -1;
4062         }
4063     }
4064 }
4065
4066 static void print_cg_move(FILE *fplog,
4067                           gmx_domdec_t *dd,
4068                           gmx_large_int_t step,int cg,int dim,int dir,
4069                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4070                           rvec cm_old,rvec cm_new,real pos_d)
4071 {
4072     gmx_domdec_comm_t *comm;
4073     char buf[22];
4074
4075     comm = dd->comm;
4076
4077     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4078     if (bHaveLimitdAndCMOld)
4079     {
4080         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4081                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4082     }
4083     else
4084     {
4085         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4086                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4087     }
4088     fprintf(fplog,"distance out of cell %f\n",
4089             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4090     if (bHaveLimitdAndCMOld)
4091     {
4092         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4093                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4094     }
4095     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4096             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4097     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4098             dim2char(dim),
4099             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4100     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4101             dim2char(dim),
4102             comm->cell_x0[dim],comm->cell_x1[dim]);
4103 }
4104
4105 static void cg_move_error(FILE *fplog,
4106                           gmx_domdec_t *dd,
4107                           gmx_large_int_t step,int cg,int dim,int dir,
4108                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4109                           rvec cm_old,rvec cm_new,real pos_d)
4110 {
4111     if (fplog)
4112     {
4113         print_cg_move(fplog, dd,step,cg,dim,dir,
4114                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4115     }
4116     print_cg_move(stderr,dd,step,cg,dim,dir,
4117                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4118     gmx_fatal(FARGS,
4119               "A charge group moved too far between two domain decomposition steps\n"
4120               "This usually means that your system is not well equilibrated");
4121 }
4122
4123 static void rotate_state_atom(t_state *state,int a)
4124 {
4125     int est;
4126
4127     for(est=0; est<estNR; est++)
4128     {
4129         if (EST_DISTR(est) && state->flags & (1<<est)) {
4130             switch (est) {
4131             case estX:
4132                 /* Rotate the complete state; for a rectangular box only */
4133                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4134                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4135                 break;
4136             case estV:
4137                 state->v[a][YY] = -state->v[a][YY];
4138                 state->v[a][ZZ] = -state->v[a][ZZ];
4139                 break;
4140             case estSDX:
4141                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4142                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4143                 break;
4144             case estCGP:
4145                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4146                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4147                 break;
4148             case estDISRE_INITF:
4149             case estDISRE_RM3TAV:
4150             case estORIRE_INITF:
4151             case estORIRE_DTAV:
4152                 /* These are distances, so not affected by rotation */
4153                 break;
4154             default:
4155                 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4156             }
4157         }
4158     }
4159 }
4160
4161 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4162                               gmx_domdec_t *dd,ivec tric_dir,
4163                               t_state *state,rvec **f,
4164                               t_forcerec *fr,t_mdatoms *md,
4165                               gmx_bool bCompact,
4166                               t_nrnb *nrnb)
4167 {
4168     int  *move;
4169     int  npbcdim;
4170     int  ncg[DIM*2],nat[DIM*2];
4171     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4172     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4173     int  sbuf[2],rbuf[2];
4174     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4175     int  flag;
4176     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4177     gmx_bool bScrew;
4178     ivec dev;
4179     real inv_ncg,pos_d;
4180     matrix tcm;
4181     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4182     atom_id *cgindex;
4183     cginfo_mb_t *cginfo_mb;
4184     gmx_domdec_comm_t *comm;
4185
4186     if (dd->bScrewPBC)
4187     {
4188         check_screw_box(state->box);
4189     }
4190
4191     comm  = dd->comm;
4192     cg_cm = fr->cg_cm;
4193
4194     for(i=0; i<estNR; i++)
4195     {
4196         if (EST_DISTR(i))
4197         {
4198             switch (i)
4199             {
4200             case estX:   /* Always present */            break;
4201             case estV:   bV   = (state->flags & (1<<i)); break;
4202             case estSDX: bSDX = (state->flags & (1<<i)); break;
4203             case estCGP: bCGP = (state->flags & (1<<i)); break;
4204             case estLD_RNG:
4205             case estLD_RNGI:
4206             case estDISRE_INITF:
4207             case estDISRE_RM3TAV:
4208             case estORIRE_INITF:
4209             case estORIRE_DTAV:
4210                 /* No processing required */
4211                 break;
4212             default:
4213             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4214             }
4215         }
4216     }
4217
4218     if (dd->ncg_tot > comm->nalloc_int)
4219     {
4220         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4221         srenew(comm->buf_int,comm->nalloc_int);
4222     }
4223     move = comm->buf_int;
4224
4225     /* Clear the count */
4226     for(c=0; c<dd->ndim*2; c++)
4227     {
4228         ncg[c] = 0;
4229         nat[c] = 0;
4230     }
4231
4232     npbcdim = dd->npbcdim;
4233
4234     for(d=0; (d<DIM); d++)
4235     {
4236         limitd[d] = dd->comm->cellsize_min[d];
4237         if (d >= npbcdim && dd->ci[d] == 0)
4238         {
4239             cell_x0[d] = -GMX_FLOAT_MAX;
4240         }
4241         else
4242         {
4243             cell_x0[d] = comm->cell_x0[d];
4244         }
4245         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4246         {
4247             cell_x1[d] = GMX_FLOAT_MAX;
4248         }
4249         else
4250         {
4251             cell_x1[d] = comm->cell_x1[d];
4252         }
4253         if (d < npbcdim)
4254         {
4255             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4256             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4257         }
4258         else
4259         {
4260             /* We check after communication if a charge group moved
4261              * more than one cell. Set the pre-comm check limit to float_max.
4262              */
4263             limit0[d] = -GMX_FLOAT_MAX;
4264             limit1[d] =  GMX_FLOAT_MAX;
4265         }
4266     }
4267
4268     make_tric_corr_matrix(npbcdim,state->box,tcm);
4269
4270     cgindex = dd->cgindex;
4271
4272     /* Compute the center of geometry for all home charge groups
4273      * and put them in the box and determine where they should go.
4274      */
4275     for(cg=0; cg<dd->ncg_home; cg++)
4276     {
4277         k0   = cgindex[cg];
4278         k1   = cgindex[cg+1];
4279         nrcg = k1 - k0;
4280         if (nrcg == 1)
4281         {
4282             copy_rvec(state->x[k0],cm_new);
4283         }
4284         else
4285         {
4286             inv_ncg = 1.0/nrcg;
4287
4288             clear_rvec(cm_new);
4289             for(k=k0; (k<k1); k++)
4290             {
4291                 rvec_inc(cm_new,state->x[k]);
4292             }
4293             for(d=0; (d<DIM); d++)
4294             {
4295                 cm_new[d] = inv_ncg*cm_new[d];
4296             }
4297         }
4298
4299         clear_ivec(dev);
4300         /* Do pbc and check DD cell boundary crossings */
4301         for(d=DIM-1; d>=0; d--)
4302         {
4303             if (dd->nc[d] > 1)
4304             {
4305                 bScrew = (dd->bScrewPBC && d == XX);
4306                 /* Determine the location of this cg in lattice coordinates */
4307                 pos_d = cm_new[d];
4308                 if (tric_dir[d])
4309                 {
4310                     for(d2=d+1; d2<DIM; d2++)
4311                     {
4312                         pos_d += cm_new[d2]*tcm[d2][d];
4313                     }
4314                 }
4315                 /* Put the charge group in the triclinic unit-cell */
4316                 if (pos_d >= cell_x1[d])
4317                 {
4318                     if (pos_d >= limit1[d])
4319                     {
4320                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4321                                       cg_cm[cg],cm_new,pos_d);
4322                     }
4323                     dev[d] = 1;
4324                     if (dd->ci[d] == dd->nc[d] - 1)
4325                     {
4326                         rvec_dec(cm_new,state->box[d]);
4327                         if (bScrew)
4328                         {
4329                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4330                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4331                         }
4332                         for(k=k0; (k<k1); k++)
4333                         {
4334                             rvec_dec(state->x[k],state->box[d]);
4335                             if (bScrew)
4336                             {
4337                                 rotate_state_atom(state,k);
4338                             }
4339                         }
4340                     }
4341                 }
4342                 else if (pos_d < cell_x0[d])
4343                 {
4344                     if (pos_d < limit0[d])
4345                     {
4346                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4347                                       cg_cm[cg],cm_new,pos_d);
4348                     }
4349                     dev[d] = -1;
4350                     if (dd->ci[d] == 0)
4351                     {
4352                         rvec_inc(cm_new,state->box[d]);
4353                         if (bScrew)
4354                         {
4355                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4356                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4357                         }
4358                         for(k=k0; (k<k1); k++)
4359                         {
4360                             rvec_inc(state->x[k],state->box[d]);
4361                             if (bScrew)
4362                             {
4363                                 rotate_state_atom(state,k);
4364                             }
4365                         }
4366                     }
4367                 }
4368             }
4369             else if (d < npbcdim)
4370             {
4371                 /* Put the charge group in the rectangular unit-cell */
4372                 while (cm_new[d] >= state->box[d][d])
4373                 {
4374                     rvec_dec(cm_new,state->box[d]);
4375                     for(k=k0; (k<k1); k++)
4376                     {
4377                         rvec_dec(state->x[k],state->box[d]);
4378                     }
4379                 }
4380                 while (cm_new[d] < 0)
4381                 {
4382                     rvec_inc(cm_new,state->box[d]);
4383                     for(k=k0; (k<k1); k++)
4384                     {
4385                         rvec_inc(state->x[k],state->box[d]);
4386                     }
4387                 }
4388             }
4389         }
4390
4391         copy_rvec(cm_new,cg_cm[cg]);
4392
4393         /* Determine where this cg should go */
4394         flag = 0;
4395         mc = -1;
4396         for(d=0; d<dd->ndim; d++)
4397         {
4398             dim = dd->dim[d];
4399             if (dev[dim] == 1)
4400             {
4401                 flag |= DD_FLAG_FW(d);
4402                 if (mc == -1)
4403                 {
4404                     mc = d*2;
4405                 }
4406             }
4407             else if (dev[dim] == -1)
4408             {
4409                 flag |= DD_FLAG_BW(d);
4410                 if (mc == -1) {
4411                     if (dd->nc[dim] > 2)
4412                     {
4413                         mc = d*2 + 1;
4414                     }
4415                     else
4416                     {
4417                         mc = d*2;
4418                     }
4419                 }
4420             }
4421         }
4422         move[cg] = mc;
4423         if (mc >= 0)
4424         {
4425             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4426             {
4427                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4428                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4429             }
4430             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4431             /* We store the cg size in the lower 16 bits
4432              * and the place where the charge group should go
4433              * in the next 6 bits. This saves some communication volume.
4434              */
4435             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4436             ncg[mc] += 1;
4437             nat[mc] += nrcg;
4438         }
4439     }
4440
4441     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4442     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4443
4444     nvec = 1;
4445     if (bV)
4446     {
4447         nvec++;
4448     }
4449     if (bSDX)
4450     {
4451         nvec++;
4452     }
4453     if (bCGP)
4454     {
4455         nvec++;
4456     }
4457
4458     /* Make sure the communication buffers are large enough */
4459     for(mc=0; mc<dd->ndim*2; mc++)
4460     {
4461         nvr = ncg[mc] + nat[mc]*nvec;
4462         if (nvr > comm->cgcm_state_nalloc[mc])
4463         {
4464             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4465             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4466         }
4467     }
4468
4469     /* Recalculating cg_cm might be cheaper than communicating,
4470      * but that could give rise to rounding issues.
4471      */
4472     home_pos_cg =
4473         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4474                                 nvec,cg_cm,comm,bCompact);
4475
4476     vec = 0;
4477     home_pos_at =
4478         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4479                                 nvec,vec++,state->x,comm,bCompact);
4480     if (bV)
4481     {
4482         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4483                                 nvec,vec++,state->v,comm,bCompact);
4484     }
4485     if (bSDX)
4486     {
4487         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4488                                 nvec,vec++,state->sd_X,comm,bCompact);
4489     }
4490     if (bCGP)
4491     {
4492         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4493                                 nvec,vec++,state->cg_p,comm,bCompact);
4494     }
4495
4496     if (bCompact)
4497     {
4498         compact_ind(dd->ncg_home,move,
4499                     dd->index_gl,dd->cgindex,dd->gatindex,
4500                     dd->ga2la,comm->bLocalCG,
4501                     fr->cginfo);
4502     }
4503     else
4504     {
4505         clear_and_mark_ind(dd->ncg_home,move,
4506                            dd->index_gl,dd->cgindex,dd->gatindex,
4507                            dd->ga2la,comm->bLocalCG,
4508                            fr->ns.grid->cell_index);
4509     }
4510
4511     cginfo_mb = fr->cginfo_mb;
4512
4513     ncg_stay_home = home_pos_cg;
4514     for(d=0; d<dd->ndim; d++)
4515     {
4516         dim = dd->dim[d];
4517         ncg_recv = 0;
4518         nat_recv = 0;
4519         nvr      = 0;
4520         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4521         {
4522             cdd = d*2 + dir;
4523             /* Communicate the cg and atom counts */
4524             sbuf[0] = ncg[cdd];
4525             sbuf[1] = nat[cdd];
4526             if (debug)
4527             {
4528                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4529                         d,dir,sbuf[0],sbuf[1]);
4530             }
4531             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4532
4533             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4534             {
4535                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4536                 srenew(comm->buf_int,comm->nalloc_int);
4537             }
4538
4539             /* Communicate the charge group indices, sizes and flags */
4540             dd_sendrecv_int(dd, d, dir,
4541                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4542                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4543
4544             nvs = ncg[cdd] + nat[cdd]*nvec;
4545             i   = rbuf[0]  + rbuf[1] *nvec;
4546             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4547
4548             /* Communicate cgcm and state */
4549             dd_sendrecv_rvec(dd, d, dir,
4550                              comm->cgcm_state[cdd], nvs,
4551                              comm->vbuf.v+nvr, i);
4552             ncg_recv += rbuf[0];
4553             nat_recv += rbuf[1];
4554             nvr      += i;
4555         }
4556
4557         /* Process the received charge groups */
4558         buf_pos = 0;
4559         for(cg=0; cg<ncg_recv; cg++)
4560         {
4561             flag = comm->buf_int[cg*DD_CGIBS+1];
4562
4563             if (dim >= npbcdim && dd->nc[dim] > 2)
4564             {
4565                 /* No pbc in this dim and more than one domain boundary.
4566                  * We to a separate check if a charge did not move too far.
4567                  */
4568                 if (((flag & DD_FLAG_FW(d)) &&
4569                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4570                     ((flag & DD_FLAG_BW(d)) &&
4571                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4572                 {
4573                     cg_move_error(fplog,dd,step,cg,d,
4574                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4575                                    FALSE,0,
4576                                    comm->vbuf.v[buf_pos],
4577                                    comm->vbuf.v[buf_pos],
4578                                    comm->vbuf.v[buf_pos][d]);
4579                 }
4580             }
4581
4582             mc = -1;
4583             if (d < dd->ndim-1)
4584             {
4585                 /* Check which direction this cg should go */
4586                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4587                 {
4588                     if (dd->bGridJump)
4589                     {
4590                         /* The cell boundaries for dimension d2 are not equal
4591                          * for each cell row of the lower dimension(s),
4592                          * therefore we might need to redetermine where
4593                          * this cg should go.
4594                          */
4595                         dim2 = dd->dim[d2];
4596                         /* If this cg crosses the box boundary in dimension d2
4597                          * we can use the communicated flag, so we do not
4598                          * have to worry about pbc.
4599                          */
4600                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4601                                (flag & DD_FLAG_FW(d2))) ||
4602                               (dd->ci[dim2] == 0 &&
4603                                (flag & DD_FLAG_BW(d2)))))
4604                         {
4605                             /* Clear the two flags for this dimension */
4606                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4607                             /* Determine the location of this cg
4608                              * in lattice coordinates
4609                              */
4610                             pos_d = comm->vbuf.v[buf_pos][dim2];
4611                             if (tric_dir[dim2])
4612                             {
4613                                 for(d3=dim2+1; d3<DIM; d3++)
4614                                 {
4615                                     pos_d +=
4616                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4617                                 }
4618                             }
4619                             /* Check of we are not at the box edge.
4620                              * pbc is only handled in the first step above,
4621                              * but this check could move over pbc while
4622                              * the first step did not due to different rounding.
4623                              */
4624                             if (pos_d >= cell_x1[dim2] &&
4625                                 dd->ci[dim2] != dd->nc[dim2]-1)
4626                             {
4627                                 flag |= DD_FLAG_FW(d2);
4628                             }
4629                             else if (pos_d < cell_x0[dim2] &&
4630                                      dd->ci[dim2] != 0)
4631                             {
4632                                 flag |= DD_FLAG_BW(d2);
4633                             }
4634                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4635                         }
4636                     }
4637                     /* Set to which neighboring cell this cg should go */
4638                     if (flag & DD_FLAG_FW(d2))
4639                     {
4640                         mc = d2*2;
4641                     }
4642                     else if (flag & DD_FLAG_BW(d2))
4643                     {
4644                         if (dd->nc[dd->dim[d2]] > 2)
4645                         {
4646                             mc = d2*2+1;
4647                         }
4648                         else
4649                         {
4650                             mc = d2*2;
4651                         }
4652                     }
4653                 }
4654             }
4655
4656             nrcg = flag & DD_FLAG_NRCG;
4657             if (mc == -1)
4658             {
4659                 if (home_pos_cg+1 > dd->cg_nalloc)
4660                 {
4661                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4662                     srenew(dd->index_gl,dd->cg_nalloc);
4663                     srenew(dd->cgindex,dd->cg_nalloc+1);
4664                 }
4665                 /* Set the global charge group index and size */
4666                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4667                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4668                 /* Copy the state from the buffer */
4669                 if (home_pos_cg >= fr->cg_nalloc)
4670                 {
4671                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4672                     cg_cm = fr->cg_cm;
4673                 }
4674                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4675                 /* Set the cginfo */
4676                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4677                                                    dd->index_gl[home_pos_cg]);
4678                 if (comm->bLocalCG)
4679                 {
4680                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4681                 }
4682
4683                 if (home_pos_at+nrcg > state->nalloc)
4684                 {
4685                     dd_realloc_state(state,f,home_pos_at+nrcg);
4686                 }
4687                 for(i=0; i<nrcg; i++)
4688                 {
4689                     copy_rvec(comm->vbuf.v[buf_pos++],
4690                               state->x[home_pos_at+i]);
4691                 }
4692                 if (bV)
4693                 {
4694                     for(i=0; i<nrcg; i++)
4695                     {
4696                         copy_rvec(comm->vbuf.v[buf_pos++],
4697                                   state->v[home_pos_at+i]);
4698                     }
4699                 }
4700                 if (bSDX)
4701                 {
4702                     for(i=0; i<nrcg; i++)
4703                     {
4704                         copy_rvec(comm->vbuf.v[buf_pos++],
4705                                   state->sd_X[home_pos_at+i]);
4706                     }
4707                 }
4708                 if (bCGP)
4709                 {
4710                     for(i=0; i<nrcg; i++)
4711                     {
4712                         copy_rvec(comm->vbuf.v[buf_pos++],
4713                                   state->cg_p[home_pos_at+i]);
4714                     }
4715                 }
4716                 home_pos_cg += 1;
4717                 home_pos_at += nrcg;
4718             }
4719             else
4720             {
4721                 /* Reallocate the buffers if necessary  */
4722                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4723                 {
4724                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4725                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4726                 }
4727                 nvr = ncg[mc] + nat[mc]*nvec;
4728                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4729                 {
4730                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4731                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4732                 }
4733                 /* Copy from the receive to the send buffers */
4734                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4735                        comm->buf_int + cg*DD_CGIBS,
4736                        DD_CGIBS*sizeof(int));
4737                 memcpy(comm->cgcm_state[mc][nvr],
4738                        comm->vbuf.v[buf_pos],
4739                        (1+nrcg*nvec)*sizeof(rvec));
4740                 buf_pos += 1 + nrcg*nvec;
4741                 ncg[mc] += 1;
4742                 nat[mc] += nrcg;
4743             }
4744         }
4745     }
4746
4747     /* With sorting (!bCompact) the indices are now only partially up to date
4748      * and ncg_home and nat_home are not the real count, since there are
4749      * "holes" in the arrays for the charge groups that moved to neighbors.
4750      */
4751     dd->ncg_home = home_pos_cg;
4752     dd->nat_home = home_pos_at;
4753
4754     if (debug)
4755     {
4756         fprintf(debug,"Finished repartitioning\n");
4757     }
4758
4759     return ncg_stay_home;
4760 }
4761
4762 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4763 {
4764     dd->comm->cycl[ddCycl] += cycles;
4765     dd->comm->cycl_n[ddCycl]++;
4766     if (cycles > dd->comm->cycl_max[ddCycl])
4767     {
4768         dd->comm->cycl_max[ddCycl] = cycles;
4769     }
4770 }
4771
4772 static double force_flop_count(t_nrnb *nrnb)
4773 {
4774     int i;
4775     double sum;
4776     const char *name;
4777
4778     sum = 0;
4779     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4780     {
4781         /* To get closer to the real timings, we half the count
4782          * for the normal loops and again half it for water loops.
4783          */
4784         name = nrnb_str(i);
4785         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4786         {
4787             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4788         }
4789         else
4790         {
4791             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4792         }
4793     }
4794     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4795     {
4796         name = nrnb_str(i);
4797         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4798         sum += nrnb->n[i]*cost_nrnb(i);
4799     }
4800     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4801     {
4802         sum += nrnb->n[i]*cost_nrnb(i);
4803     }
4804
4805     return sum;
4806 }
4807
4808 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4809 {
4810     if (dd->comm->eFlop)
4811     {
4812         dd->comm->flop -= force_flop_count(nrnb);
4813     }
4814 }
4815 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4816 {
4817     if (dd->comm->eFlop)
4818     {
4819         dd->comm->flop += force_flop_count(nrnb);
4820         dd->comm->flop_n++;
4821     }
4822 }
4823
4824 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4825 {
4826     int i;
4827
4828     for(i=0; i<ddCyclNr; i++)
4829     {
4830         dd->comm->cycl[i] = 0;
4831         dd->comm->cycl_n[i] = 0;
4832         dd->comm->cycl_max[i] = 0;
4833     }
4834     dd->comm->flop = 0;
4835     dd->comm->flop_n = 0;
4836 }
4837
4838 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4839 {
4840     gmx_domdec_comm_t *comm;
4841     gmx_domdec_load_t *load;
4842     gmx_domdec_root_t *root=NULL;
4843     int  d,dim,cid,i,pos;
4844     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4845     gmx_bool bSepPME;
4846
4847     if (debug)
4848     {
4849         fprintf(debug,"get_load_distribution start\n");
4850     }
4851
4852     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4853
4854     comm = dd->comm;
4855
4856     bSepPME = (dd->pme_nodeid >= 0);
4857
4858     for(d=dd->ndim-1; d>=0; d--)
4859     {
4860         dim = dd->dim[d];
4861         /* Check if we participate in the communication in this dimension */
4862         if (d == dd->ndim-1 ||
4863             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4864         {
4865             load = &comm->load[d];
4866             if (dd->bGridJump)
4867             {
4868                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4869             }
4870             pos = 0;
4871             if (d == dd->ndim-1)
4872             {
4873                 sbuf[pos++] = dd_force_load(comm);
4874                 sbuf[pos++] = sbuf[0];
4875                 if (dd->bGridJump)
4876                 {
4877                     sbuf[pos++] = sbuf[0];
4878                     sbuf[pos++] = cell_frac;
4879                     if (d > 0)
4880                     {
4881                         sbuf[pos++] = comm->cell_f_max0[d];
4882                         sbuf[pos++] = comm->cell_f_min1[d];
4883                     }
4884                 }
4885                 if (bSepPME)
4886                 {
4887                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4888                     sbuf[pos++] = comm->cycl[ddCyclPME];
4889                 }
4890             }
4891             else
4892             {
4893                 sbuf[pos++] = comm->load[d+1].sum;
4894                 sbuf[pos++] = comm->load[d+1].max;
4895                 if (dd->bGridJump)
4896                 {
4897                     sbuf[pos++] = comm->load[d+1].sum_m;
4898                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4899                     sbuf[pos++] = comm->load[d+1].flags;
4900                     if (d > 0)
4901                     {
4902                         sbuf[pos++] = comm->cell_f_max0[d];
4903                         sbuf[pos++] = comm->cell_f_min1[d];
4904                     }
4905                 }
4906                 if (bSepPME)
4907                 {
4908                     sbuf[pos++] = comm->load[d+1].mdf;
4909                     sbuf[pos++] = comm->load[d+1].pme;
4910                 }
4911             }
4912             load->nload = pos;
4913             /* Communicate a row in DD direction d.
4914              * The communicators are setup such that the root always has rank 0.
4915              */
4916 #ifdef GMX_MPI
4917             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4918                        load->load,load->nload*sizeof(float),MPI_BYTE,
4919                        0,comm->mpi_comm_load[d]);
4920 #endif
4921             if (dd->ci[dim] == dd->master_ci[dim])
4922             {
4923                 /* We are the root, process this row */
4924                 if (comm->bDynLoadBal)
4925                 {
4926                     root = comm->root[d];
4927                 }
4928                 load->sum = 0;
4929                 load->max = 0;
4930                 load->sum_m = 0;
4931                 load->cvol_min = 1;
4932                 load->flags = 0;
4933                 load->mdf = 0;
4934                 load->pme = 0;
4935                 pos = 0;
4936                 for(i=0; i<dd->nc[dim]; i++)
4937                 {
4938                     load->sum += load->load[pos++];
4939                     load->max = max(load->max,load->load[pos]);
4940                     pos++;
4941                     if (dd->bGridJump)
4942                     {
4943                         if (root->bLimited)
4944                         {
4945                             /* This direction could not be load balanced properly,
4946                              * therefore we need to use the maximum iso the average load.
4947                              */
4948                             load->sum_m = max(load->sum_m,load->load[pos]);
4949                         }
4950                         else
4951                         {
4952                             load->sum_m += load->load[pos];
4953                         }
4954                         pos++;
4955                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4956                         pos++;
4957                         if (d < dd->ndim-1)
4958                         {
4959                             load->flags = (int)(load->load[pos++] + 0.5);
4960                         }
4961                         if (d > 0)
4962                         {
4963                             root->cell_f_max0[i] = load->load[pos++];
4964                             root->cell_f_min1[i] = load->load[pos++];
4965                         }
4966                     }
4967                     if (bSepPME)
4968                     {
4969                         load->mdf = max(load->mdf,load->load[pos]);
4970                         pos++;
4971                         load->pme = max(load->pme,load->load[pos]);
4972                         pos++;
4973                     }
4974                 }
4975                 if (comm->bDynLoadBal && root->bLimited)
4976                 {
4977                     load->sum_m *= dd->nc[dim];
4978                     load->flags |= (1<<d);
4979                 }
4980             }
4981         }
4982     }
4983
4984     if (DDMASTER(dd))
4985     {
4986         comm->nload      += dd_load_count(comm);
4987         comm->load_step  += comm->cycl[ddCyclStep];
4988         comm->load_sum   += comm->load[0].sum;
4989         comm->load_max   += comm->load[0].max;
4990         if (comm->bDynLoadBal)
4991         {
4992             for(d=0; d<dd->ndim; d++)
4993             {
4994                 if (comm->load[0].flags & (1<<d))
4995                 {
4996                     comm->load_lim[d]++;
4997                 }
4998             }
4999         }
5000         if (bSepPME)
5001         {
5002             comm->load_mdf += comm->load[0].mdf;
5003             comm->load_pme += comm->load[0].pme;
5004         }
5005     }
5006
5007     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5008
5009     if (debug)
5010     {
5011         fprintf(debug,"get_load_distribution finished\n");
5012     }
5013 }
5014
5015 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5016 {
5017     /* Return the relative performance loss on the total run time
5018      * due to the force calculation load imbalance.
5019      */
5020     if (dd->comm->nload > 0)
5021     {
5022         return
5023             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5024             (dd->comm->load_step*dd->nnodes);
5025     }
5026     else
5027     {
5028         return 0;
5029     }
5030 }
5031
5032 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5033 {
5034     char  buf[STRLEN];
5035     int   npp,npme,nnodes,d,limp;
5036     float imbal,pme_f_ratio,lossf,lossp=0;
5037     gmx_bool  bLim;
5038     gmx_domdec_comm_t *comm;
5039
5040     comm = dd->comm;
5041     if (DDMASTER(dd) && comm->nload > 0)
5042     {
5043         npp    = dd->nnodes;
5044         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5045         nnodes = npp + npme;
5046         imbal = comm->load_max*npp/comm->load_sum - 1;
5047         lossf = dd_force_imb_perf_loss(dd);
5048         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5049         fprintf(fplog,"%s",buf);
5050         fprintf(stderr,"\n");
5051         fprintf(stderr,"%s",buf);
5052         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5053         fprintf(fplog,"%s",buf);
5054         fprintf(stderr,"%s",buf);
5055         bLim = FALSE;
5056         if (comm->bDynLoadBal)
5057         {
5058             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5059             for(d=0; d<dd->ndim; d++)
5060             {
5061                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5062                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5063                 if (limp >= 50)
5064                 {
5065                     bLim = TRUE;
5066                 }
5067             }
5068             sprintf(buf+strlen(buf),"\n");
5069             fprintf(fplog,"%s",buf);
5070             fprintf(stderr,"%s",buf);
5071         }
5072         if (npme > 0)
5073         {
5074             pme_f_ratio = comm->load_pme/comm->load_mdf;
5075             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5076             if (lossp <= 0)
5077             {
5078                 lossp *= (float)npme/(float)nnodes;
5079             }
5080             else
5081             {
5082                 lossp *= (float)npp/(float)nnodes;
5083             }
5084             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5085             fprintf(fplog,"%s",buf);
5086             fprintf(stderr,"%s",buf);
5087             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5088             fprintf(fplog,"%s",buf);
5089             fprintf(stderr,"%s",buf);
5090         }
5091         fprintf(fplog,"\n");
5092         fprintf(stderr,"\n");
5093
5094         if (lossf >= DD_PERF_LOSS)
5095         {
5096             sprintf(buf,
5097                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5098                     "      in the domain decomposition.\n",lossf*100);
5099             if (!comm->bDynLoadBal)
5100             {
5101                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5102             }
5103             else if (bLim)
5104             {
5105                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5106             }
5107             fprintf(fplog,"%s\n",buf);
5108             fprintf(stderr,"%s\n",buf);
5109         }
5110         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5111         {
5112             sprintf(buf,
5113                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5114                     "      had %s work to do than the PP nodes.\n"
5115                     "      You might want to %s the number of PME nodes\n"
5116                     "      or %s the cut-off and the grid spacing.\n",
5117                     fabs(lossp*100),
5118                     (lossp < 0) ? "less"     : "more",
5119                     (lossp < 0) ? "decrease" : "increase",
5120                     (lossp < 0) ? "decrease" : "increase");
5121             fprintf(fplog,"%s\n",buf);
5122             fprintf(stderr,"%s\n",buf);
5123         }
5124     }
5125 }
5126
5127 static float dd_vol_min(gmx_domdec_t *dd)
5128 {
5129     return dd->comm->load[0].cvol_min*dd->nnodes;
5130 }
5131
5132 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5133 {
5134     return dd->comm->load[0].flags;
5135 }
5136
5137 static float dd_f_imbal(gmx_domdec_t *dd)
5138 {
5139     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5140 }
5141
5142 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5143 {
5144     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5145 }
5146
5147 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5148 {
5149     int flags,d;
5150     char buf[22];
5151
5152     flags = dd_load_flags(dd);
5153     if (flags)
5154     {
5155         fprintf(fplog,
5156                 "DD  load balancing is limited by minimum cell size in dimension");
5157         for(d=0; d<dd->ndim; d++)
5158         {
5159             if (flags & (1<<d))
5160             {
5161                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5162             }
5163         }
5164         fprintf(fplog,"\n");
5165     }
5166     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5167     if (dd->comm->bDynLoadBal)
5168     {
5169         fprintf(fplog,"  vol min/aver %5.3f%c",
5170                 dd_vol_min(dd),flags ? '!' : ' ');
5171     }
5172     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5173     if (dd->comm->cycl_n[ddCyclPME])
5174     {
5175         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5176     }
5177     fprintf(fplog,"\n\n");
5178 }
5179
5180 static void dd_print_load_verbose(gmx_domdec_t *dd)
5181 {
5182     if (dd->comm->bDynLoadBal)
5183     {
5184         fprintf(stderr,"vol %4.2f%c ",
5185                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5186     }
5187     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5188     if (dd->comm->cycl_n[ddCyclPME])
5189     {
5190         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5191     }
5192 }
5193
5194 #ifdef GMX_MPI
5195 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5196                                    int dim_ind,ivec loc)
5197 {
5198     MPI_Group g_row;
5199     MPI_Comm  c_row;
5200     int  dim,i,*rank;
5201     ivec loc_c;
5202     gmx_domdec_root_t *root;
5203
5204     dim = dd->dim[dim_ind];
5205     copy_ivec(loc,loc_c);
5206     snew(rank,dd->nc[dim]);
5207     for(i=0; i<dd->nc[dim]; i++)
5208     {
5209         loc_c[dim] = i;
5210         rank[i] = dd_index(dd->nc,loc_c);
5211     }
5212     /* Here we create a new group, that does not necessarily
5213      * include our process. But MPI_Comm_create needs to be
5214      * called by all the processes in the original communicator.
5215      * Calling MPI_Group_free afterwards gives errors, so I assume
5216      * also the group is needed by all processes. (B. Hess)
5217      */
5218     MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5219     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5220     if (c_row != MPI_COMM_NULL)
5221     {
5222         /* This process is part of the group */
5223         dd->comm->mpi_comm_load[dim_ind] = c_row;
5224         if (dd->comm->eDLB != edlbNO)
5225         {
5226             if (dd->ci[dim] == dd->master_ci[dim])
5227             {
5228                 /* This is the root process of this row */
5229                 snew(dd->comm->root[dim_ind],1);
5230                 root = dd->comm->root[dim_ind];
5231                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5232                 snew(root->old_cell_f,dd->nc[dim]+1);
5233                 snew(root->bCellMin,dd->nc[dim]);
5234                 if (dim_ind > 0)
5235                 {
5236                     snew(root->cell_f_max0,dd->nc[dim]);
5237                     snew(root->cell_f_min1,dd->nc[dim]);
5238                     snew(root->bound_min,dd->nc[dim]);
5239                     snew(root->bound_max,dd->nc[dim]);
5240                 }
5241                 snew(root->buf_ncd,dd->nc[dim]);
5242             }
5243             else
5244             {
5245                 /* This is not a root process, we only need to receive cell_f */
5246                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5247             }
5248         }
5249         if (dd->ci[dim] == dd->master_ci[dim])
5250         {
5251             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5252         }
5253     }
5254     sfree(rank);
5255 }
5256 #endif
5257
5258 static void make_load_communicators(gmx_domdec_t *dd)
5259 {
5260 #ifdef GMX_MPI
5261   MPI_Group g_all;
5262   int  dim0,dim1,i,j;
5263   ivec loc;
5264
5265   if (debug)
5266     fprintf(debug,"Making load communicators\n");
5267
5268   MPI_Comm_group(dd->mpi_comm_all,&g_all);
5269
5270   snew(dd->comm->load,dd->ndim);
5271   snew(dd->comm->mpi_comm_load,dd->ndim);
5272
5273   clear_ivec(loc);
5274   make_load_communicator(dd,g_all,0,loc);
5275   if (dd->ndim > 1) {
5276     dim0 = dd->dim[0];
5277     for(i=0; i<dd->nc[dim0]; i++) {
5278       loc[dim0] = i;
5279       make_load_communicator(dd,g_all,1,loc);
5280     }
5281   }
5282   if (dd->ndim > 2) {
5283     dim0 = dd->dim[0];
5284     for(i=0; i<dd->nc[dim0]; i++) {
5285       loc[dim0] = i;
5286       dim1 = dd->dim[1];
5287       for(j=0; j<dd->nc[dim1]; j++) {
5288           loc[dim1] = j;
5289           make_load_communicator(dd,g_all,2,loc);
5290       }
5291     }
5292   }
5293
5294   MPI_Group_free(&g_all);
5295
5296   if (debug)
5297     fprintf(debug,"Finished making load communicators\n");
5298 #endif
5299 }
5300
5301 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5302 {
5303     gmx_bool bZYX;
5304     int  d,dim,i,j,m;
5305     ivec tmp,s;
5306     int  nzone,nzonep;
5307     ivec dd_zp[DD_MAXIZONE];
5308     gmx_domdec_zones_t *zones;
5309     gmx_domdec_ns_ranges_t *izone;
5310
5311     for(d=0; d<dd->ndim; d++)
5312     {
5313         dim = dd->dim[d];
5314         copy_ivec(dd->ci,tmp);
5315         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5316         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5317         copy_ivec(dd->ci,tmp);
5318         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5319         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5320         if (debug)
5321         {
5322             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5323                     dd->rank,dim,
5324                     dd->neighbor[d][0],
5325                     dd->neighbor[d][1]);
5326         }
5327     }
5328
5329     if (DDMASTER(dd))
5330     {
5331         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5332             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5333     }
5334     if (fplog)
5335     {
5336         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5337                 dd->ndim,
5338                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5339                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5340     }
5341     switch (dd->ndim)
5342     {
5343     case 3:
5344         nzone  = dd_z3n;
5345         nzonep = dd_zp3n;
5346         for(i=0; i<nzonep; i++)
5347         {
5348             copy_ivec(dd_zp3[i],dd_zp[i]);
5349         }
5350         break;
5351     case 2:
5352         nzone  = dd_z2n;
5353         nzonep = dd_zp2n;
5354         for(i=0; i<nzonep; i++)
5355         {
5356             copy_ivec(dd_zp2[i],dd_zp[i]);
5357         }
5358         break;
5359     case 1:
5360         nzone  = dd_z1n;
5361         nzonep = dd_zp1n;
5362         for(i=0; i<nzonep; i++)
5363         {
5364             copy_ivec(dd_zp1[i],dd_zp[i]);
5365         }
5366         break;
5367     default:
5368         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5369         nzone = 0;
5370         nzonep = 0;
5371     }
5372
5373     zones = &dd->comm->zones;
5374
5375     for(i=0; i<nzone; i++)
5376     {
5377         m = 0;
5378         clear_ivec(zones->shift[i]);
5379         for(d=0; d<dd->ndim; d++)
5380         {
5381             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5382         }
5383     }
5384
5385     zones->n = nzone;
5386     for(i=0; i<nzone; i++)
5387     {
5388         for(d=0; d<DIM; d++)
5389         {
5390             s[d] = dd->ci[d] - zones->shift[i][d];
5391             if (s[d] < 0)
5392             {
5393                 s[d] += dd->nc[d];
5394             }
5395             else if (s[d] >= dd->nc[d])
5396             {
5397                 s[d] -= dd->nc[d];
5398             }
5399         }
5400     }
5401     zones->nizone = nzonep;
5402     for(i=0; i<zones->nizone; i++)
5403     {
5404         if (dd_zp[i][0] != i)
5405         {
5406             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5407         }
5408         izone = &zones->izone[i];
5409         izone->j0 = dd_zp[i][1];
5410         izone->j1 = dd_zp[i][2];
5411         for(dim=0; dim<DIM; dim++)
5412         {
5413             if (dd->nc[dim] == 1)
5414             {
5415                 /* All shifts should be allowed */
5416                 izone->shift0[dim] = -1;
5417                 izone->shift1[dim] = 1;
5418             }
5419             else
5420             {
5421                 /*
5422                   izone->shift0[d] = 0;
5423                   izone->shift1[d] = 0;
5424                   for(j=izone->j0; j<izone->j1; j++) {
5425                   if (dd->shift[j][d] > dd->shift[i][d])
5426                   izone->shift0[d] = -1;
5427                   if (dd->shift[j][d] < dd->shift[i][d])
5428                   izone->shift1[d] = 1;
5429                   }
5430                 */
5431
5432                 int shift_diff;
5433
5434                 /* Assume the shift are not more than 1 cell */
5435                 izone->shift0[dim] = 1;
5436                 izone->shift1[dim] = -1;
5437                 for(j=izone->j0; j<izone->j1; j++)
5438                 {
5439                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5440                     if (shift_diff < izone->shift0[dim])
5441                     {
5442                         izone->shift0[dim] = shift_diff;
5443                     }
5444                     if (shift_diff > izone->shift1[dim])
5445                     {
5446                         izone->shift1[dim] = shift_diff;
5447                     }
5448                 }
5449             }
5450         }
5451     }
5452
5453     if (dd->comm->eDLB != edlbNO)
5454     {
5455         snew(dd->comm->root,dd->ndim);
5456     }
5457
5458     if (dd->comm->bRecordLoad)
5459     {
5460         make_load_communicators(dd);
5461     }
5462 }
5463
5464 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5465 {
5466     gmx_domdec_t *dd;
5467     gmx_domdec_comm_t *comm;
5468     int  i,rank,*buf;
5469     ivec periods;
5470 #ifdef GMX_MPI
5471     MPI_Comm comm_cart;
5472 #endif
5473
5474     dd = cr->dd;
5475     comm = dd->comm;
5476
5477 #ifdef GMX_MPI
5478     if (comm->bCartesianPP)
5479     {
5480         /* Set up cartesian communication for the particle-particle part */
5481         if (fplog)
5482         {
5483             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5484                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5485         }
5486
5487         for(i=0; i<DIM; i++)
5488         {
5489             periods[i] = TRUE;
5490         }
5491         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5492                         &comm_cart);
5493         /* We overwrite the old communicator with the new cartesian one */
5494         cr->mpi_comm_mygroup = comm_cart;
5495     }
5496
5497     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5498     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5499
5500     if (comm->bCartesianPP_PME)
5501     {
5502         /* Since we want to use the original cartesian setup for sim,
5503          * and not the one after split, we need to make an index.
5504          */
5505         snew(comm->ddindex2ddnodeid,dd->nnodes);
5506         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5507         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5508         /* Get the rank of the DD master,
5509          * above we made sure that the master node is a PP node.
5510          */
5511         if (MASTER(cr))
5512         {
5513             rank = dd->rank;
5514         }
5515         else
5516         {
5517             rank = 0;
5518         }
5519         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5520     }
5521     else if (comm->bCartesianPP)
5522     {
5523         if (cr->npmenodes == 0)
5524         {
5525             /* The PP communicator is also
5526              * the communicator for this simulation
5527              */
5528             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5529         }
5530         cr->nodeid = dd->rank;
5531
5532         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5533
5534         /* We need to make an index to go from the coordinates
5535          * to the nodeid of this simulation.
5536          */
5537         snew(comm->ddindex2simnodeid,dd->nnodes);
5538         snew(buf,dd->nnodes);
5539         if (cr->duty & DUTY_PP)
5540         {
5541             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5542         }
5543         /* Communicate the ddindex to simulation nodeid index */
5544         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5545                       cr->mpi_comm_mysim);
5546         sfree(buf);
5547
5548         /* Determine the master coordinates and rank.
5549          * The DD master should be the same node as the master of this sim.
5550          */
5551         for(i=0; i<dd->nnodes; i++)
5552         {
5553             if (comm->ddindex2simnodeid[i] == 0)
5554             {
5555                 ddindex2xyz(dd->nc,i,dd->master_ci);
5556                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5557             }
5558         }
5559         if (debug)
5560         {
5561             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5562         }
5563     }
5564     else
5565     {
5566         /* No Cartesian communicators */
5567         /* We use the rank in dd->comm->all as DD index */
5568         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5569         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5570         dd->masterrank = 0;
5571         clear_ivec(dd->master_ci);
5572     }
5573 #endif
5574
5575     if (fplog)
5576     {
5577         fprintf(fplog,
5578                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5579                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5580     }
5581     if (debug)
5582     {
5583         fprintf(debug,
5584                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5585                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5586     }
5587 }
5588
5589 static void receive_ddindex2simnodeid(t_commrec *cr)
5590 {
5591     gmx_domdec_t *dd;
5592
5593     gmx_domdec_comm_t *comm;
5594     int  *buf;
5595
5596     dd = cr->dd;
5597     comm = dd->comm;
5598
5599 #ifdef GMX_MPI
5600     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5601     {
5602         snew(comm->ddindex2simnodeid,dd->nnodes);
5603         snew(buf,dd->nnodes);
5604         if (cr->duty & DUTY_PP)
5605         {
5606             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5607         }
5608 #ifdef GMX_MPI
5609         /* Communicate the ddindex to simulation nodeid index */
5610         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5611                       cr->mpi_comm_mysim);
5612 #endif
5613         sfree(buf);
5614     }
5615 #endif
5616 }
5617
5618 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5619                                                      int ncg,int natoms)
5620 {
5621     gmx_domdec_master_t *ma;
5622     int i;
5623
5624     snew(ma,1);
5625
5626     snew(ma->ncg,dd->nnodes);
5627     snew(ma->index,dd->nnodes+1);
5628     snew(ma->cg,ncg);
5629     snew(ma->nat,dd->nnodes);
5630     snew(ma->ibuf,dd->nnodes*2);
5631     snew(ma->cell_x,DIM);
5632     for(i=0; i<DIM; i++)
5633     {
5634         snew(ma->cell_x[i],dd->nc[i]+1);
5635     }
5636
5637     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5638     {
5639         ma->vbuf = NULL;
5640     }
5641     else
5642     {
5643         snew(ma->vbuf,natoms);
5644     }
5645
5646     return ma;
5647 }
5648
5649 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5650                                int reorder)
5651 {
5652     gmx_domdec_t *dd;
5653     gmx_domdec_comm_t *comm;
5654     int  i,rank;
5655     gmx_bool bDiv[DIM];
5656     ivec periods;
5657 #ifdef GMX_MPI
5658     MPI_Comm comm_cart;
5659 #endif
5660
5661     dd = cr->dd;
5662     comm = dd->comm;
5663
5664     if (comm->bCartesianPP)
5665     {
5666         for(i=1; i<DIM; i++)
5667         {
5668             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5669         }
5670         if (bDiv[YY] || bDiv[ZZ])
5671         {
5672             comm->bCartesianPP_PME = TRUE;
5673             /* If we have 2D PME decomposition, which is always in x+y,
5674              * we stack the PME only nodes in z.
5675              * Otherwise we choose the direction that provides the thinnest slab
5676              * of PME only nodes as this will have the least effect
5677              * on the PP communication.
5678              * But for the PME communication the opposite might be better.
5679              */
5680             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5681                              !bDiv[YY] ||
5682                              dd->nc[YY] > dd->nc[ZZ]))
5683             {
5684                 comm->cartpmedim = ZZ;
5685             }
5686             else
5687             {
5688                 comm->cartpmedim = YY;
5689             }
5690             comm->ntot[comm->cartpmedim]
5691                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5692         }
5693         else if (fplog)
5694         {
5695             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5696             fprintf(fplog,
5697                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5698         }
5699     }
5700
5701 #ifdef GMX_MPI
5702     if (comm->bCartesianPP_PME)
5703     {
5704         if (fplog)
5705         {
5706             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5707         }
5708
5709         for(i=0; i<DIM; i++)
5710         {
5711             periods[i] = TRUE;
5712         }
5713         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5714                         &comm_cart);
5715
5716         MPI_Comm_rank(comm_cart,&rank);
5717         if (MASTERNODE(cr) && rank != 0)
5718         {
5719             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5720         }
5721
5722         /* With this assigment we loose the link to the original communicator
5723          * which will usually be MPI_COMM_WORLD, unless have multisim.
5724          */
5725         cr->mpi_comm_mysim = comm_cart;
5726         cr->sim_nodeid = rank;
5727
5728         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5729
5730         if (fplog)
5731         {
5732             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5733                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5734         }
5735
5736         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5737         {
5738             cr->duty = DUTY_PP;
5739         }
5740         if (cr->npmenodes == 0 ||
5741             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5742         {
5743             cr->duty = DUTY_PME;
5744         }
5745
5746         /* Split the sim communicator into PP and PME only nodes */
5747         MPI_Comm_split(cr->mpi_comm_mysim,
5748                        cr->duty,
5749                        dd_index(comm->ntot,dd->ci),
5750                        &cr->mpi_comm_mygroup);
5751     }
5752     else
5753     {
5754         switch (dd_node_order)
5755         {
5756         case ddnoPP_PME:
5757             if (fplog)
5758             {
5759                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5760             }
5761             break;
5762         case ddnoINTERLEAVE:
5763             /* Interleave the PP-only and PME-only nodes,
5764              * as on clusters with dual-core machines this will double
5765              * the communication bandwidth of the PME processes
5766              * and thus speed up the PP <-> PME and inter PME communication.
5767              */
5768             if (fplog)
5769             {
5770                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5771             }
5772             comm->pmenodes = dd_pmenodes(cr);
5773             break;
5774         case ddnoCARTESIAN:
5775             break;
5776         default:
5777             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5778         }
5779
5780         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5781         {
5782             cr->duty = DUTY_PME;
5783         }
5784         else
5785         {
5786             cr->duty = DUTY_PP;
5787         }
5788
5789         /* Split the sim communicator into PP and PME only nodes */
5790         MPI_Comm_split(cr->mpi_comm_mysim,
5791                        cr->duty,
5792                        cr->nodeid,
5793                        &cr->mpi_comm_mygroup);
5794         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5795     }
5796 #endif
5797
5798     if (fplog)
5799     {
5800         fprintf(fplog,"This is a %s only node\n\n",
5801                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5802     }
5803 }
5804
5805 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5806 {
5807     gmx_domdec_t *dd;
5808     gmx_domdec_comm_t *comm;
5809     int CartReorder;
5810
5811     dd = cr->dd;
5812     comm = dd->comm;
5813
5814     copy_ivec(dd->nc,comm->ntot);
5815
5816     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5817     comm->bCartesianPP_PME = FALSE;
5818
5819     /* Reorder the nodes by default. This might change the MPI ranks.
5820      * Real reordering is only supported on very few architectures,
5821      * Blue Gene is one of them.
5822      */
5823     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5824
5825     if (cr->npmenodes > 0)
5826     {
5827         /* Split the communicator into a PP and PME part */
5828         split_communicator(fplog,cr,dd_node_order,CartReorder);
5829         if (comm->bCartesianPP_PME)
5830         {
5831             /* We (possibly) reordered the nodes in split_communicator,
5832              * so it is no longer required in make_pp_communicator.
5833              */
5834             CartReorder = FALSE;
5835         }
5836     }
5837     else
5838     {
5839         /* All nodes do PP and PME */
5840 #ifdef GMX_MPI
5841         /* We do not require separate communicators */
5842         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5843 #endif
5844     }
5845
5846     if (cr->duty & DUTY_PP)
5847     {
5848         /* Copy or make a new PP communicator */
5849         make_pp_communicator(fplog,cr,CartReorder);
5850     }
5851     else
5852     {
5853         receive_ddindex2simnodeid(cr);
5854     }
5855
5856     if (!(cr->duty & DUTY_PME))
5857     {
5858         /* Set up the commnuication to our PME node */
5859         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5860         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5861         if (debug)
5862         {
5863             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5864                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5865         }
5866     }
5867     else
5868     {
5869         dd->pme_nodeid = -1;
5870     }
5871
5872     if (DDMASTER(dd))
5873     {
5874         dd->ma = init_gmx_domdec_master_t(dd,
5875                                           comm->cgs_gl.nr,
5876                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5877     }
5878 }
5879
5880 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5881 {
5882     real *slb_frac,tot;
5883     int  i,n;
5884     double dbl;
5885
5886     slb_frac = NULL;
5887     if (nc > 1 && size_string != NULL)
5888     {
5889         if (fplog)
5890         {
5891             fprintf(fplog,"Using static load balancing for the %s direction\n",
5892                     dir);
5893         }
5894         snew(slb_frac,nc);
5895         tot = 0;
5896         for (i=0; i<nc; i++)
5897         {
5898             dbl = 0;
5899             sscanf(size_string,"%lf%n",&dbl,&n);
5900             if (dbl == 0)
5901             {
5902                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5903             }
5904             slb_frac[i] = dbl;
5905             size_string += n;
5906             tot += slb_frac[i];
5907         }
5908         /* Normalize */
5909         if (fplog)
5910         {
5911             fprintf(fplog,"Relative cell sizes:");
5912         }
5913         for (i=0; i<nc; i++)
5914         {
5915             slb_frac[i] /= tot;
5916             if (fplog)
5917             {
5918                 fprintf(fplog," %5.3f",slb_frac[i]);
5919             }
5920         }
5921         if (fplog)
5922         {
5923             fprintf(fplog,"\n");
5924         }
5925     }
5926
5927     return slb_frac;
5928 }
5929
5930 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5931 {
5932     int n,nmol,ftype;
5933     gmx_mtop_ilistloop_t iloop;
5934     t_ilist *il;
5935
5936     n = 0;
5937     iloop = gmx_mtop_ilistloop_init(mtop);
5938     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5939     {
5940         for(ftype=0; ftype<F_NRE; ftype++)
5941         {
5942             if ((interaction_function[ftype].flags & IF_BOND) &&
5943                 NRAL(ftype) >  2)
5944             {
5945                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5946             }
5947         }
5948   }
5949
5950   return n;
5951 }
5952
5953 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5954 {
5955     char *val;
5956     int  nst;
5957
5958     nst = def;
5959     val = getenv(env_var);
5960     if (val)
5961     {
5962         if (sscanf(val,"%d",&nst) <= 0)
5963         {
5964             nst = 1;
5965         }
5966         if (fplog)
5967         {
5968             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5969                     env_var,val,nst);
5970         }
5971     }
5972
5973     return nst;
5974 }
5975
5976 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5977 {
5978     if (MASTER(cr))
5979     {
5980         fprintf(stderr,"\n%s\n",warn_string);
5981     }
5982     if (fplog)
5983     {
5984         fprintf(fplog,"\n%s\n",warn_string);
5985     }
5986 }
5987
5988 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5989                                   t_inputrec *ir,FILE *fplog)
5990 {
5991     if (ir->ePBC == epbcSCREW &&
5992         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5993     {
5994         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
5995     }
5996
5997     if (ir->ns_type == ensSIMPLE)
5998     {
5999         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6000     }
6001
6002     if (ir->nstlist == 0)
6003     {
6004         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6005     }
6006
6007     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6008     {
6009         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6010     }
6011 }
6012
6013 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6014 {
6015     int  di,d;
6016     real r;
6017
6018     r = ddbox->box_size[XX];
6019     for(di=0; di<dd->ndim; di++)
6020     {
6021         d = dd->dim[di];
6022         /* Check using the initial average cell size */
6023         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6024     }
6025
6026     return r;
6027 }
6028
6029 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6030                              const char *dlb_opt,gmx_bool bRecordLoad,
6031                              unsigned long Flags,t_inputrec *ir)
6032 {
6033     gmx_domdec_t *dd;
6034     int  eDLB=-1;
6035     char buf[STRLEN];
6036
6037     switch (dlb_opt[0])
6038     {
6039     case 'a': eDLB = edlbAUTO; break;
6040     case 'n': eDLB = edlbNO;   break;
6041     case 'y': eDLB = edlbYES;  break;
6042     default: gmx_incons("Unknown dlb_opt");
6043     }
6044
6045     if (Flags & MD_RERUN)
6046     {
6047         return edlbNO;
6048     }
6049
6050     if (!EI_DYNAMICS(ir->eI))
6051     {
6052         if (eDLB == edlbYES)
6053         {
6054             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6055             dd_warning(cr,fplog,buf);
6056         }
6057
6058         return edlbNO;
6059     }
6060
6061     if (!bRecordLoad)
6062     {
6063         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6064
6065         return edlbNO;
6066     }
6067
6068     if (Flags & MD_REPRODUCIBLE)
6069     {
6070         switch (eDLB)
6071         {
6072                         case edlbNO:
6073                                 break;
6074                         case edlbAUTO:
6075                                 dd_warning(cr,fplog,"NOTE: reproducability requested, will not use dynamic load balancing\n");
6076                                 eDLB = edlbNO;
6077                                 break;
6078                         case edlbYES:
6079                                 dd_warning(cr,fplog,"WARNING: reproducability requested with dynamic load balancing, the simulation will NOT be binary reproducable\n");
6080                                 break;
6081                         default:
6082                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6083                                 break;
6084         }
6085     }
6086
6087     return eDLB;
6088 }
6089
6090 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6091 {
6092     int dim;
6093
6094     dd->ndim = 0;
6095     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6096     {
6097         /* Decomposition order z,y,x */
6098         if (fplog)
6099         {
6100             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6101         }
6102         for(dim=DIM-1; dim>=0; dim--)
6103         {
6104             if (dd->nc[dim] > 1)
6105             {
6106                 dd->dim[dd->ndim++] = dim;
6107             }
6108         }
6109     }
6110     else
6111     {
6112         /* Decomposition order x,y,z */
6113         for(dim=0; dim<DIM; dim++)
6114         {
6115             if (dd->nc[dim] > 1)
6116             {
6117                 dd->dim[dd->ndim++] = dim;
6118             }
6119         }
6120     }
6121 }
6122
6123 static gmx_domdec_comm_t *init_dd_comm()
6124 {
6125     gmx_domdec_comm_t *comm;
6126     int  i;
6127
6128     snew(comm,1);
6129     snew(comm->cggl_flag,DIM*2);
6130     snew(comm->cgcm_state,DIM*2);
6131     for(i=0; i<DIM*2; i++)
6132     {
6133         comm->cggl_flag_nalloc[i]  = 0;
6134         comm->cgcm_state_nalloc[i] = 0;
6135     }
6136
6137     comm->nalloc_int = 0;
6138     comm->buf_int    = NULL;
6139
6140     vec_rvec_init(&comm->vbuf);
6141
6142     comm->n_load_have    = 0;
6143     comm->n_load_collect = 0;
6144
6145     for(i=0; i<ddnatNR-ddnatZONE; i++)
6146     {
6147         comm->sum_nat[i] = 0;
6148     }
6149     comm->ndecomp = 0;
6150     comm->nload   = 0;
6151     comm->load_step = 0;
6152     comm->load_sum  = 0;
6153     comm->load_max  = 0;
6154     clear_ivec(comm->load_lim);
6155     comm->load_mdf  = 0;
6156     comm->load_pme  = 0;
6157
6158     return comm;
6159 }
6160
6161 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6162                                         unsigned long Flags,
6163                                         ivec nc,
6164                                         real comm_distance_min,real rconstr,
6165                                         const char *dlb_opt,real dlb_scale,
6166                                         const char *sizex,const char *sizey,const char *sizez,
6167                                         gmx_mtop_t *mtop,t_inputrec *ir,
6168                                         matrix box,rvec *x,
6169                                         gmx_ddbox_t *ddbox,
6170                                         int *npme_x,int *npme_y)
6171 {
6172     gmx_domdec_t *dd;
6173     gmx_domdec_comm_t *comm;
6174     int  recload;
6175     int  d,i,j;
6176     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6177     gmx_bool bC;
6178     char buf[STRLEN];
6179
6180     if (fplog)
6181     {
6182         fprintf(fplog,
6183                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6184     }
6185
6186     snew(dd,1);
6187
6188     dd->comm = init_dd_comm();
6189     comm = dd->comm;
6190     snew(comm->cggl_flag,DIM*2);
6191     snew(comm->cgcm_state,DIM*2);
6192
6193     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6194     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6195
6196     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6197     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6198     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6199     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6200     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6201     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6202     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6203
6204     dd->pme_recv_f_alloc = 0;
6205     dd->pme_recv_f_buf = NULL;
6206
6207     if (dd->bSendRecv2 && fplog)
6208     {
6209         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6210     }
6211     if (comm->eFlop)
6212     {
6213         if (fplog)
6214         {
6215             fprintf(fplog,"Will load balance based on FLOP count\n");
6216         }
6217         if (comm->eFlop > 1)
6218         {
6219             srand(1+cr->nodeid);
6220         }
6221         comm->bRecordLoad = TRUE;
6222     }
6223     else
6224     {
6225         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6226
6227     }
6228
6229     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6230
6231     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6232     if (fplog)
6233     {
6234         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6235     }
6236     dd->bGridJump = comm->bDynLoadBal;
6237
6238     if (comm->nstSortCG)
6239     {
6240         if (fplog)
6241         {
6242             if (comm->nstSortCG == 1)
6243             {
6244                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6245             }
6246             else
6247             {
6248                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6249                         comm->nstSortCG);
6250             }
6251         }
6252         snew(comm->sort,1);
6253     }
6254     else
6255     {
6256         if (fplog)
6257         {
6258             fprintf(fplog,"Will not sort the charge groups\n");
6259         }
6260     }
6261
6262     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6263     if (comm->bInterCGBondeds)
6264     {
6265         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6266     }
6267     else
6268     {
6269         comm->bInterCGMultiBody = FALSE;
6270     }
6271
6272     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6273
6274     if (ir->rlistlong == 0)
6275     {
6276         /* Set the cut-off to some very large value,
6277          * so we don't need if statements everywhere in the code.
6278          * We use sqrt, since the cut-off is squared in some places.
6279          */
6280         comm->cutoff   = GMX_CUTOFF_INF;
6281     }
6282     else
6283     {
6284         comm->cutoff   = ir->rlistlong;
6285     }
6286     comm->cutoff_mbody = 0;
6287
6288     comm->cellsize_limit = 0;
6289     comm->bBondComm = FALSE;
6290
6291     if (comm->bInterCGBondeds)
6292     {
6293         if (comm_distance_min > 0)
6294         {
6295             comm->cutoff_mbody = comm_distance_min;
6296             if (Flags & MD_DDBONDCOMM)
6297             {
6298                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6299             }
6300             else
6301             {
6302                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6303             }
6304             r_bonded_limit = comm->cutoff_mbody;
6305         }
6306         else if (ir->bPeriodicMols)
6307         {
6308             /* Can not easily determine the required cut-off */
6309             dd_warning(cr,fplog,"NOTE: Periodic molecules: can not easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
6310             comm->cutoff_mbody = comm->cutoff/2;
6311             r_bonded_limit = comm->cutoff_mbody;
6312         }
6313         else
6314         {
6315             if (MASTER(cr))
6316             {
6317                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6318                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6319             }
6320             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6321             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6322
6323             /* We use an initial margin of 10% for the minimum cell size,
6324              * except when we are just below the non-bonded cut-off.
6325              */
6326             if (Flags & MD_DDBONDCOMM)
6327             {
6328                 if (max(r_2b,r_mb) > comm->cutoff)
6329                 {
6330                     r_bonded       = max(r_2b,r_mb);
6331                     r_bonded_limit = 1.1*r_bonded;
6332                     comm->bBondComm = TRUE;
6333                 }
6334                 else
6335                 {
6336                     r_bonded       = r_mb;
6337                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6338                 }
6339                 /* We determine cutoff_mbody later */
6340             }
6341             else
6342             {
6343                 /* No special bonded communication,
6344                  * simply increase the DD cut-off.
6345                  */
6346                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6347                 comm->cutoff_mbody = r_bonded_limit;
6348                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6349             }
6350         }
6351         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6352         if (fplog)
6353         {
6354             fprintf(fplog,
6355                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6356                     comm->cellsize_limit);
6357         }
6358     }
6359
6360     if (dd->bInterCGcons && rconstr <= 0)
6361     {
6362         /* There is a cell size limit due to the constraints (P-LINCS) */
6363         rconstr = constr_r_max(fplog,mtop,ir);
6364         if (fplog)
6365         {
6366             fprintf(fplog,
6367                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6368                     rconstr);
6369             if (rconstr > comm->cellsize_limit)
6370             {
6371                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6372             }
6373         }
6374     }
6375     else if (rconstr > 0 && fplog)
6376     {
6377         /* Here we do not check for dd->bInterCGcons,
6378          * because one can also set a cell size limit for virtual sites only
6379          * and at this point we don't know yet if there are intercg v-sites.
6380          */
6381         fprintf(fplog,
6382                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6383                 rconstr);
6384     }
6385     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6386
6387     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6388
6389     if (nc[XX] > 0)
6390     {
6391         copy_ivec(nc,dd->nc);
6392         set_dd_dim(fplog,dd);
6393         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6394
6395         if (cr->npmenodes == -1)
6396         {
6397             cr->npmenodes = 0;
6398         }
6399         acs = average_cellsize_min(dd,ddbox);
6400         if (acs < comm->cellsize_limit)
6401         {
6402             if (fplog)
6403             {
6404                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6405             }
6406             gmx_fatal_collective(FARGS,cr,NULL,
6407                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6408                                  acs,comm->cellsize_limit);
6409         }
6410     }
6411     else
6412     {
6413         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6414
6415         /* We need to choose the optimal DD grid and possibly PME nodes */
6416         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6417                                comm->eDLB!=edlbNO,dlb_scale,
6418                                comm->cellsize_limit,comm->cutoff,
6419                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6420
6421         if (dd->nc[XX] == 0)
6422         {
6423             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6424             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6425                     !bC ? "-rdd" : "-rcon",
6426                     comm->eDLB!=edlbNO ? " or -dds" : "",
6427                     bC ? " or your LINCS settings" : "");
6428
6429             gmx_fatal_collective(FARGS,cr,NULL,
6430                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6431                                  "%s\n"
6432                                  "Look in the log file for details on the domain decomposition",
6433                                  cr->nnodes-cr->npmenodes,limit,buf);
6434         }
6435         set_dd_dim(fplog,dd);
6436     }
6437
6438     if (fplog)
6439     {
6440         fprintf(fplog,
6441                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6442                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6443     }
6444
6445     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6446     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6447     {
6448         gmx_fatal_collective(FARGS,cr,NULL,
6449                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6450                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6451     }
6452     if (cr->npmenodes > dd->nnodes)
6453     {
6454         gmx_fatal_collective(FARGS,cr,NULL,
6455                              "The number of separate PME node (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6456     }
6457     if (cr->npmenodes > 0)
6458     {
6459         comm->npmenodes = cr->npmenodes;
6460     }
6461     else
6462     {
6463         comm->npmenodes = dd->nnodes;
6464     }
6465
6466     if (EEL_PME(ir->coulombtype))
6467     {
6468         /* The following choices should match those
6469          * in comm_cost_est in domdec_setup.c.
6470          * Note that here the checks have to take into account
6471          * that the decomposition might occur in a different order than xyz
6472          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6473          * in which case they will not match those in comm_cost_est,
6474          * but since that is mainly for testing purposes that's fine.
6475          */
6476         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6477             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6478             getenv("GMX_PMEONEDD") == NULL)
6479         {
6480             comm->npmedecompdim = 2;
6481             comm->npmenodes_x   = dd->nc[XX];
6482             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6483         }
6484         else
6485         {
6486             /* In case nc is 1 in both x and y we could still choose to
6487              * decompose pme in y instead of x, but we use x for simplicity.
6488              */
6489             comm->npmedecompdim = 1;
6490             if (dd->dim[0] == YY)
6491             {
6492                 comm->npmenodes_x = 1;
6493                 comm->npmenodes_y = comm->npmenodes;
6494             }
6495             else
6496             {
6497                 comm->npmenodes_x = comm->npmenodes;
6498                 comm->npmenodes_y = 1;
6499             }
6500         }
6501         if (fplog)
6502         {
6503             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6504                     comm->npmenodes_x,comm->npmenodes_y,1);
6505         }
6506     }
6507     else
6508     {
6509         comm->npmedecompdim = 0;
6510         comm->npmenodes_x   = 0;
6511         comm->npmenodes_y   = 0;
6512     }
6513
6514     /* Technically we don't need both of these,
6515      * but it simplifies code not having to recalculate it.
6516      */
6517     *npme_x = comm->npmenodes_x;
6518     *npme_y = comm->npmenodes_y;
6519
6520     snew(comm->slb_frac,DIM);
6521     if (comm->eDLB == edlbNO)
6522     {
6523         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6524         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6525         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6526     }
6527
6528     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6529     {
6530         if (comm->bBondComm || comm->eDLB != edlbNO)
6531         {
6532             /* Set the bonded communication distance to halfway
6533              * the minimum and the maximum,
6534              * since the extra communication cost is nearly zero.
6535              */
6536             acs = average_cellsize_min(dd,ddbox);
6537             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6538             if (comm->eDLB != edlbNO)
6539             {
6540                 /* Check if this does not limit the scaling */
6541                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6542             }
6543             if (!comm->bBondComm)
6544             {
6545                 /* Without bBondComm do not go beyond the n.b. cut-off */
6546                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6547                 if (comm->cellsize_limit >= comm->cutoff)
6548                 {
6549                     /* We don't loose a lot of efficieny
6550                      * when increasing it to the n.b. cut-off.
6551                      * It can even be slightly faster, because we need
6552                      * less checks for the communication setup.
6553                      */
6554                     comm->cutoff_mbody = comm->cutoff;
6555                 }
6556             }
6557             /* Check if we did not end up below our original limit */
6558             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6559
6560             if (comm->cutoff_mbody > comm->cellsize_limit)
6561             {
6562                 comm->cellsize_limit = comm->cutoff_mbody;
6563             }
6564         }
6565         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6566     }
6567
6568     if (debug)
6569     {
6570         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6571                 "cellsize limit %f\n",
6572                 comm->bBondComm,comm->cellsize_limit);
6573     }
6574
6575     if (MASTER(cr))
6576     {
6577         check_dd_restrictions(cr,dd,ir,fplog);
6578     }
6579
6580     comm->partition_step = INT_MIN;
6581     dd->ddp_count = 0;
6582
6583     clear_dd_cycle_counts(dd);
6584
6585     return dd;
6586 }
6587
6588 static void set_dlb_limits(gmx_domdec_t *dd)
6589
6590 {
6591     int d;
6592
6593     for(d=0; d<dd->ndim; d++)
6594     {
6595         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6596         dd->comm->cellsize_min[dd->dim[d]] =
6597             dd->comm->cellsize_min_dlb[dd->dim[d]];
6598     }
6599 }
6600
6601
6602 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6603 {
6604     gmx_domdec_t *dd;
6605     gmx_domdec_comm_t *comm;
6606     real cellsize_min;
6607     int  d,nc,i;
6608     char buf[STRLEN];
6609
6610     dd = cr->dd;
6611     comm = dd->comm;
6612
6613     if (fplog)
6614     {
6615         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6616     }
6617
6618     cellsize_min = comm->cellsize_min[dd->dim[0]];
6619     for(d=1; d<dd->ndim; d++)
6620     {
6621         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6622     }
6623
6624     if (cellsize_min < comm->cellsize_limit*1.05)
6625     {
6626         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6627
6628         /* Change DLB from "auto" to "no". */
6629         comm->eDLB = edlbNO;
6630
6631         return;
6632     }
6633
6634     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6635     comm->bDynLoadBal = TRUE;
6636     dd->bGridJump = TRUE;
6637
6638     set_dlb_limits(dd);
6639
6640     /* We can set the required cell size info here,
6641      * so we do not need to communicate this.
6642      * The grid is completely uniform.
6643      */
6644     for(d=0; d<dd->ndim; d++)
6645     {
6646         if (comm->root[d])
6647         {
6648             comm->load[d].sum_m = comm->load[d].sum;
6649
6650             nc = dd->nc[dd->dim[d]];
6651             for(i=0; i<nc; i++)
6652             {
6653                 comm->root[d]->cell_f[i]    = i/(real)nc;
6654                 if (d > 0)
6655                 {
6656                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6657                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6658                 }
6659             }
6660             comm->root[d]->cell_f[nc] = 1.0;
6661         }
6662     }
6663 }
6664
6665 static char *init_bLocalCG(gmx_mtop_t *mtop)
6666 {
6667     int  ncg,cg;
6668     char *bLocalCG;
6669
6670     ncg = ncg_mtop(mtop);
6671     snew(bLocalCG,ncg);
6672     for(cg=0; cg<ncg; cg++)
6673     {
6674         bLocalCG[cg] = FALSE;
6675     }
6676
6677     return bLocalCG;
6678 }
6679
6680 void dd_init_bondeds(FILE *fplog,
6681                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6682                      gmx_vsite_t *vsite,gmx_constr_t constr,
6683                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6684 {
6685     gmx_domdec_comm_t *comm;
6686     gmx_bool bBondComm;
6687     int  d;
6688
6689     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6690
6691     comm = dd->comm;
6692
6693     if (comm->bBondComm)
6694     {
6695         /* Communicate atoms beyond the cut-off for bonded interactions */
6696         comm = dd->comm;
6697
6698         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6699
6700         comm->bLocalCG = init_bLocalCG(mtop);
6701     }
6702     else
6703     {
6704         /* Only communicate atoms based on cut-off */
6705         comm->cglink   = NULL;
6706         comm->bLocalCG = NULL;
6707     }
6708 }
6709
6710 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6711                               t_inputrec *ir,
6712                               gmx_bool bDynLoadBal,real dlb_scale,
6713                               gmx_ddbox_t *ddbox)
6714 {
6715     gmx_domdec_comm_t *comm;
6716     int  d;
6717     ivec np;
6718     real limit,shrink;
6719     char buf[64];
6720
6721     if (fplog == NULL)
6722     {
6723         return;
6724     }
6725
6726     comm = dd->comm;
6727
6728     if (bDynLoadBal)
6729     {
6730         fprintf(fplog,"The maximum number of communication pulses is:");
6731         for(d=0; d<dd->ndim; d++)
6732         {
6733             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6734         }
6735         fprintf(fplog,"\n");
6736         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6737         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6738         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6739         for(d=0; d<DIM; d++)
6740         {
6741             if (dd->nc[d] > 1)
6742             {
6743                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6744                 {
6745                     shrink = 0;
6746                 }
6747                 else
6748                 {
6749                     shrink =
6750                         comm->cellsize_min_dlb[d]/
6751                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6752                 }
6753                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6754             }
6755         }
6756         fprintf(fplog,"\n");
6757     }
6758     else
6759     {
6760         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6761         fprintf(fplog,"The initial number of communication pulses is:");
6762         for(d=0; d<dd->ndim; d++)
6763         {
6764             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6765         }
6766         fprintf(fplog,"\n");
6767         fprintf(fplog,"The initial domain decomposition cell size is:");
6768         for(d=0; d<DIM; d++) {
6769             if (dd->nc[d] > 1)
6770             {
6771                 fprintf(fplog," %c %.2f nm",
6772                         dim2char(d),dd->comm->cellsize_min[d]);
6773             }
6774         }
6775         fprintf(fplog,"\n\n");
6776     }
6777
6778     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6779     {
6780         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6781         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6782                 "non-bonded interactions","",comm->cutoff);
6783
6784         if (bDynLoadBal)
6785         {
6786             limit = dd->comm->cellsize_limit;
6787         }
6788         else
6789         {
6790             if (dynamic_dd_box(ddbox,ir))
6791             {
6792                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6793             }
6794             limit = dd->comm->cellsize_min[XX];
6795             for(d=1; d<DIM; d++)
6796             {
6797                 limit = min(limit,dd->comm->cellsize_min[d]);
6798             }
6799         }
6800
6801         if (comm->bInterCGBondeds)
6802         {
6803             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6804                     "two-body bonded interactions","(-rdd)",
6805                     max(comm->cutoff,comm->cutoff_mbody));
6806             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6807                     "multi-body bonded interactions","(-rdd)",
6808                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6809         }
6810         if (dd->vsite_comm)
6811         {
6812             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6813                     "virtual site constructions","(-rcon)",limit);
6814         }
6815         if (dd->constraint_comm)
6816         {
6817             sprintf(buf,"atoms separated by up to %d constraints",
6818                     1+ir->nProjOrder);
6819             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6820                     buf,"(-rcon)",limit);
6821         }
6822         fprintf(fplog,"\n");
6823     }
6824
6825     fflush(fplog);
6826 }
6827
6828 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6829                        t_inputrec *ir,t_forcerec *fr,
6830                        gmx_ddbox_t *ddbox)
6831 {
6832     gmx_domdec_comm_t *comm;
6833     int  d,dim,npulse,npulse_d_max,npulse_d;
6834     gmx_bool bNoCutOff;
6835     int  natoms_tot;
6836     real vol_frac;
6837
6838     comm = dd->comm;
6839
6840     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6841
6842     if (EEL_PME(ir->coulombtype))
6843     {
6844         init_ddpme(dd,&comm->ddpme[0],0);
6845         if (comm->npmedecompdim >= 2)
6846         {
6847             init_ddpme(dd,&comm->ddpme[1],1);
6848         }
6849     }
6850     else
6851     {
6852         comm->npmenodes = 0;
6853         if (dd->pme_nodeid >= 0)
6854         {
6855             gmx_fatal_collective(FARGS,NULL,dd,
6856                                  "Can not have separate PME nodes without PME electrostatics");
6857         }
6858     }
6859
6860     /* If each molecule is a single charge group
6861      * or we use domain decomposition for each periodic dimension,
6862      * we do not need to take pbc into account for the bonded interactions.
6863      */
6864     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6865         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6866     {
6867         fr->bMolPBC = FALSE;
6868     }
6869     else
6870     {
6871         fr->bMolPBC = TRUE;
6872     }
6873
6874     if (debug)
6875     {
6876         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6877     }
6878     if (comm->eDLB != edlbNO)
6879     {
6880         /* Determine the maximum number of comm. pulses in one dimension */
6881
6882         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6883
6884         /* Determine the maximum required number of grid pulses */
6885         if (comm->cellsize_limit >= comm->cutoff)
6886         {
6887             /* Only a single pulse is required */
6888             npulse = 1;
6889         }
6890         else if (!bNoCutOff && comm->cellsize_limit > 0)
6891         {
6892             /* We round down slightly here to avoid overhead due to the latency
6893              * of extra communication calls when the cut-off
6894              * would be only slightly longer than the cell size.
6895              * Later cellsize_limit is redetermined,
6896              * so we can not miss interactions due to this rounding.
6897              */
6898             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6899         }
6900         else
6901         {
6902             /* There is no cell size limit */
6903             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6904         }
6905
6906         if (!bNoCutOff && npulse > 1)
6907         {
6908             /* See if we can do with less pulses, based on dlb_scale */
6909             npulse_d_max = 0;
6910             for(d=0; d<dd->ndim; d++)
6911             {
6912                 dim = dd->dim[d];
6913                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6914                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6915                 npulse_d_max = max(npulse_d_max,npulse_d);
6916             }
6917             npulse = min(npulse,npulse_d_max);
6918         }
6919
6920         /* This env var can override npulse */
6921         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6922         if (d > 0)
6923         {
6924             npulse = d;
6925         }
6926
6927         comm->maxpulse = 1;
6928         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6929         for(d=0; d<dd->ndim; d++)
6930         {
6931             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6932             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6933             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6934             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6935             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6936             {
6937                 comm->bVacDLBNoLimit = FALSE;
6938             }
6939         }
6940
6941         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6942         if (!comm->bVacDLBNoLimit)
6943         {
6944             comm->cellsize_limit = max(comm->cellsize_limit,
6945                                        comm->cutoff/comm->maxpulse);
6946         }
6947         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6948         /* Set the minimum cell size for each DD dimension */
6949         for(d=0; d<dd->ndim; d++)
6950         {
6951             if (comm->bVacDLBNoLimit ||
6952                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6953             {
6954                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6955             }
6956             else
6957             {
6958                 comm->cellsize_min_dlb[dd->dim[d]] =
6959                     comm->cutoff/comm->cd[d].np_dlb;
6960             }
6961         }
6962         if (comm->cutoff_mbody <= 0)
6963         {
6964             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6965         }
6966         if (comm->bDynLoadBal)
6967         {
6968             set_dlb_limits(dd);
6969         }
6970     }
6971
6972     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6973     if (comm->eDLB == edlbAUTO)
6974     {
6975         if (fplog)
6976         {
6977             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6978         }
6979         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6980     }
6981
6982     if (ir->ePBC == epbcNONE)
6983     {
6984         vol_frac = 1 - 1/(double)dd->nnodes;
6985     }
6986     else
6987     {
6988         vol_frac =
6989             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
6990     }
6991     if (debug)
6992     {
6993         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
6994     }
6995     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
6996
6997     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
6998 }
6999
7000 static void merge_cg_buffers(int ncell,
7001                              gmx_domdec_comm_dim_t *cd, int pulse,
7002                              int  *ncg_cell,
7003                              int  *index_gl, int  *recv_i,
7004                              rvec *cg_cm,    rvec *recv_vr,
7005                              int *cgindex,
7006                              cginfo_mb_t *cginfo_mb,int *cginfo)
7007 {
7008     gmx_domdec_ind_t *ind,*ind_p;
7009     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7010     int shift,shift_at;
7011
7012     ind = &cd->ind[pulse];
7013
7014     /* First correct the already stored data */
7015     shift = ind->nrecv[ncell];
7016     for(cell=ncell-1; cell>=0; cell--)
7017     {
7018         shift -= ind->nrecv[cell];
7019         if (shift > 0)
7020         {
7021             /* Move the cg's present from previous grid pulses */
7022             cg0 = ncg_cell[ncell+cell];
7023             cg1 = ncg_cell[ncell+cell+1];
7024             cgindex[cg1+shift] = cgindex[cg1];
7025             for(cg=cg1-1; cg>=cg0; cg--)
7026             {
7027                 index_gl[cg+shift] = index_gl[cg];
7028                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7029                 cgindex[cg+shift] = cgindex[cg];
7030                 cginfo[cg+shift] = cginfo[cg];
7031             }
7032             /* Correct the already stored send indices for the shift */
7033             for(p=1; p<=pulse; p++)
7034             {
7035                 ind_p = &cd->ind[p];
7036                 cg0 = 0;
7037                 for(c=0; c<cell; c++)
7038                 {
7039                     cg0 += ind_p->nsend[c];
7040                 }
7041                 cg1 = cg0 + ind_p->nsend[cell];
7042                 for(cg=cg0; cg<cg1; cg++)
7043                 {
7044                     ind_p->index[cg] += shift;
7045                 }
7046             }
7047         }
7048     }
7049
7050     /* Merge in the communicated buffers */
7051     shift = 0;
7052     shift_at = 0;
7053     cg0 = 0;
7054     for(cell=0; cell<ncell; cell++)
7055     {
7056         cg1 = ncg_cell[ncell+cell+1] + shift;
7057         if (shift_at > 0)
7058         {
7059             /* Correct the old cg indices */
7060             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7061             {
7062                 cgindex[cg+1] += shift_at;
7063             }
7064         }
7065         for(cg=0; cg<ind->nrecv[cell]; cg++)
7066         {
7067             /* Copy this charge group from the buffer */
7068             index_gl[cg1] = recv_i[cg0];
7069             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7070             /* Add it to the cgindex */
7071             cg_gl = index_gl[cg1];
7072             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7073             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7074             cgindex[cg1+1] = cgindex[cg1] + nat;
7075             cg0++;
7076             cg1++;
7077             shift_at += nat;
7078         }
7079         shift += ind->nrecv[cell];
7080         ncg_cell[ncell+cell+1] = cg1;
7081     }
7082 }
7083
7084 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7085                                int nzone,int cg0,const int *cgindex)
7086 {
7087     int cg,zone,p;
7088
7089     /* Store the atom block boundaries for easy copying of communication buffers
7090      */
7091     cg = cg0;
7092     for(zone=0; zone<nzone; zone++)
7093     {
7094         for(p=0; p<cd->np; p++) {
7095             cd->ind[p].cell2at0[zone] = cgindex[cg];
7096             cg += cd->ind[p].nrecv[zone];
7097             cd->ind[p].cell2at1[zone] = cgindex[cg];
7098         }
7099     }
7100 }
7101
7102 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7103 {
7104     int  i;
7105     gmx_bool bMiss;
7106
7107     bMiss = FALSE;
7108     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7109     {
7110         if (!bLocalCG[link->a[i]])
7111         {
7112             bMiss = TRUE;
7113         }
7114     }
7115
7116     return bMiss;
7117 }
7118
7119 static void setup_dd_communication(gmx_domdec_t *dd,
7120                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7121 {
7122     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7123     int nzone,nzone_send,zone,zonei,cg0,cg1;
7124     int c,i,j,cg,cg_gl,nrcg;
7125     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7126     gmx_domdec_comm_t *comm;
7127     gmx_domdec_zones_t *zones;
7128     gmx_domdec_comm_dim_t *cd;
7129     gmx_domdec_ind_t *ind;
7130     cginfo_mb_t *cginfo_mb;
7131     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7132     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7133     rvec rb,rn;
7134     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7135     real bcorner[DIM],bcorner_round_1=0;
7136     ivec tric_dist;
7137     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7138     real skew_fac2_d,skew_fac_01;
7139     rvec sf2_round;
7140     int  nsend,nat;
7141
7142     if (debug)
7143     {
7144         fprintf(debug,"Setting up DD communication\n");
7145     }
7146
7147     comm  = dd->comm;
7148     cg_cm = fr->cg_cm;
7149
7150     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7151     {
7152         dim = dd->dim[dim_ind];
7153
7154         /* Check if we need to use triclinic distances */
7155         tric_dist[dim_ind] = 0;
7156         for(i=0; i<=dim_ind; i++)
7157         {
7158             if (ddbox->tric_dir[dd->dim[i]])
7159             {
7160                 tric_dist[dim_ind] = 1;
7161             }
7162         }
7163     }
7164
7165     bBondComm = comm->bBondComm;
7166
7167     /* Do we need to determine extra distances for multi-body bondeds? */
7168     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7169
7170     /* Do we need to determine extra distances for only two-body bondeds? */
7171     bDist2B = (bBondComm && !bDistMB);
7172
7173     r_comm2  = sqr(comm->cutoff);
7174     r_bcomm2 = sqr(comm->cutoff_mbody);
7175
7176     if (debug)
7177     {
7178         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7179     }
7180
7181     zones = &comm->zones;
7182
7183     dim0 = dd->dim[0];
7184     /* The first dimension is equal for all cells */
7185     corner[0][0] = comm->cell_x0[dim0];
7186     if (bDistMB)
7187     {
7188         bcorner[0] = corner[0][0];
7189     }
7190     if (dd->ndim >= 2)
7191     {
7192         dim1 = dd->dim[1];
7193         /* This cell row is only seen from the first row */
7194         corner[1][0] = comm->cell_x0[dim1];
7195         /* All rows can see this row */
7196         corner[1][1] = comm->cell_x0[dim1];
7197         if (dd->bGridJump)
7198         {
7199             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7200             if (bDistMB)
7201             {
7202                 /* For the multi-body distance we need the maximum */
7203                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7204             }
7205         }
7206         /* Set the upper-right corner for rounding */
7207         corner_round_0 = comm->cell_x1[dim0];
7208
7209         if (dd->ndim >= 3)
7210         {
7211             dim2 = dd->dim[2];
7212             for(j=0; j<4; j++)
7213             {
7214                 corner[2][j] = comm->cell_x0[dim2];
7215             }
7216             if (dd->bGridJump)
7217             {
7218                 /* Use the maximum of the i-cells that see a j-cell */
7219                 for(i=0; i<zones->nizone; i++)
7220                 {
7221                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7222                     {
7223                         if (j >= 4)
7224                         {
7225                             corner[2][j-4] =
7226                                 max(corner[2][j-4],
7227                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7228                         }
7229                     }
7230                 }
7231                 if (bDistMB)
7232                 {
7233                     /* For the multi-body distance we need the maximum */
7234                     bcorner[2] = comm->cell_x0[dim2];
7235                     for(i=0; i<2; i++)
7236                     {
7237                         for(j=0; j<2; j++)
7238                         {
7239                             bcorner[2] = max(bcorner[2],
7240                                              comm->zone_d2[i][j].p1_0);
7241                         }
7242                     }
7243                 }
7244             }
7245
7246             /* Set the upper-right corner for rounding */
7247             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7248              * Only cell (0,0,0) can see cell 7 (1,1,1)
7249              */
7250             corner_round_1[0] = comm->cell_x1[dim1];
7251             corner_round_1[3] = comm->cell_x1[dim1];
7252             if (dd->bGridJump)
7253             {
7254                 corner_round_1[0] = max(comm->cell_x1[dim1],
7255                                         comm->zone_d1[1].mch1);
7256                 if (bDistMB)
7257                 {
7258                     /* For the multi-body distance we need the maximum */
7259                     bcorner_round_1 = max(comm->cell_x1[dim1],
7260                                           comm->zone_d1[1].p1_1);
7261                 }
7262             }
7263         }
7264     }
7265
7266     /* Triclinic stuff */
7267     normal = ddbox->normal;
7268     skew_fac_01 = 0;
7269     if (dd->ndim >= 2)
7270     {
7271         v_0 = ddbox->v[dim0];
7272         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7273         {
7274             /* Determine the coupling coefficient for the distances
7275              * to the cell planes along dim0 and dim1 through dim2.
7276              * This is required for correct rounding.
7277              */
7278             skew_fac_01 =
7279                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7280             if (debug)
7281             {
7282                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7283             }
7284         }
7285     }
7286     if (dd->ndim >= 3)
7287     {
7288         v_1 = ddbox->v[dim1];
7289     }
7290
7291     zone_cg_range = zones->cg_range;
7292     index_gl = dd->index_gl;
7293     cgindex  = dd->cgindex;
7294     cginfo_mb = fr->cginfo_mb;
7295
7296     zone_cg_range[0]   = 0;
7297     zone_cg_range[1]   = dd->ncg_home;
7298     comm->zone_ncg1[0] = dd->ncg_home;
7299     pos_cg             = dd->ncg_home;
7300
7301     nat_tot = dd->nat_home;
7302     nzone = 1;
7303     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7304     {
7305         dim = dd->dim[dim_ind];
7306         cd = &comm->cd[dim_ind];
7307
7308         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7309         {
7310             /* No pbc in this dimension, the first node should not comm. */
7311             nzone_send = 0;
7312         }
7313         else
7314         {
7315             nzone_send = nzone;
7316         }
7317
7318         bScrew = (dd->bScrewPBC && dim == XX);
7319
7320         v_d = ddbox->v[dim];
7321         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7322
7323         cd->bInPlace = TRUE;
7324         for(p=0; p<cd->np; p++)
7325         {
7326             /* Only atoms communicated in the first pulse are used
7327              * for multi-body bonded interactions or for bBondComm.
7328              */
7329             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7330             bDistMB_pulse = (bDistMB && bDistBonded);
7331
7332             ind = &cd->ind[p];
7333             nsend = 0;
7334             nat = 0;
7335             for(zone=0; zone<nzone_send; zone++)
7336             {
7337                 if (tric_dist[dim_ind] && dim_ind > 0)
7338                 {
7339                     /* Determine slightly more optimized skew_fac's
7340                      * for rounding.
7341                      * This reduces the number of communicated atoms
7342                      * by about 10% for 3D DD of rhombic dodecahedra.
7343                      */
7344                     for(dimd=0; dimd<dim; dimd++)
7345                     {
7346                         sf2_round[dimd] = 1;
7347                         if (ddbox->tric_dir[dimd])
7348                         {
7349                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7350                             {
7351                                 /* If we are shifted in dimension i
7352                                  * and the cell plane is tilted forward
7353                                  * in dimension i, skip this coupling.
7354                                  */
7355                                 if (!(zones->shift[nzone+zone][i] &&
7356                                       ddbox->v[dimd][i][dimd] >= 0))
7357                                 {
7358                                     sf2_round[dimd] +=
7359                                         sqr(ddbox->v[dimd][i][dimd]);
7360                                 }
7361                             }
7362                             sf2_round[dimd] = 1/sf2_round[dimd];
7363                         }
7364                     }
7365                 }
7366
7367                 zonei = zone_perm[dim_ind][zone];
7368                 if (p == 0)
7369                 {
7370                     /* Here we permutate the zones to obtain a convenient order
7371                      * for neighbor searching
7372                      */
7373                     cg0 = zone_cg_range[zonei];
7374                     cg1 = zone_cg_range[zonei+1];
7375                 }
7376                 else
7377                 {
7378                     /* Look only at the cg's received in the previous grid pulse
7379                      */
7380                     cg1 = zone_cg_range[nzone+zone+1];
7381                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7382                 }
7383                 ind->nsend[zone] = 0;
7384                 for(cg=cg0; cg<cg1; cg++)
7385                 {
7386                     r2  = 0;
7387                     rb2 = 0;
7388                     if (tric_dist[dim_ind] == 0)
7389                     {
7390                         /* Rectangular direction, easy */
7391                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7392                         if (r > 0)
7393                         {
7394                             r2 += r*r;
7395                         }
7396                         if (bDistMB_pulse)
7397                         {
7398                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7399                             if (r > 0)
7400                             {
7401                                 rb2 += r*r;
7402                             }
7403                         }
7404                         /* Rounding gives at most a 16% reduction
7405                          * in communicated atoms
7406                          */
7407                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7408                         {
7409                             r = cg_cm[cg][dim0] - corner_round_0;
7410                             /* This is the first dimension, so always r >= 0 */
7411                             r2 += r*r;
7412                             if (bDistMB_pulse)
7413                             {
7414                                 rb2 += r*r;
7415                             }
7416                         }
7417                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7418                         {
7419                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7420                             if (r > 0)
7421                             {
7422                                 r2 += r*r;
7423                             }
7424                             if (bDistMB_pulse)
7425                             {
7426                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7427                                 if (r > 0)
7428                                 {
7429                                     rb2 += r*r;
7430                                 }
7431                             }
7432                         }
7433                     }
7434                     else
7435                     {
7436                         /* Triclinic direction, more complicated */
7437                         clear_rvec(rn);
7438                         clear_rvec(rb);
7439                         /* Rounding, conservative as the skew_fac multiplication
7440                          * will slightly underestimate the distance.
7441                          */
7442                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7443                         {
7444                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7445                             for(i=dim0+1; i<DIM; i++)
7446                             {
7447                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7448                             }
7449                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7450                             if (bDistMB_pulse)
7451                             {
7452                                 rb[dim0] = rn[dim0];
7453                                 rb2 = r2;
7454                             }
7455                             /* Take care that the cell planes along dim0 might not
7456                              * be orthogonal to those along dim1 and dim2.
7457                              */
7458                             for(i=1; i<=dim_ind; i++)
7459                             {
7460                                 dimd = dd->dim[i];
7461                                 if (normal[dim0][dimd] > 0)
7462                                 {
7463                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7464                                     if (bDistMB_pulse)
7465                                     {
7466                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7467                                     }
7468                                 }
7469                             }
7470                         }
7471                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7472                         {
7473                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7474                             tric_sh = 0;
7475                             for(i=dim1+1; i<DIM; i++)
7476                             {
7477                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7478                             }
7479                             rn[dim1] += tric_sh;
7480                             if (rn[dim1] > 0)
7481                             {
7482                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7483                                 /* Take care of coupling of the distances
7484                                  * to the planes along dim0 and dim1 through dim2.
7485                                  */
7486                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7487                                 /* Take care that the cell planes along dim1
7488                                  * might not be orthogonal to that along dim2.
7489                                  */
7490                                 if (normal[dim1][dim2] > 0)
7491                                 {
7492                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7493                                 }
7494                             }
7495                             if (bDistMB_pulse)
7496                             {
7497                                 rb[dim1] +=
7498                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7499                                 if (rb[dim1] > 0)
7500                                 {
7501                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7502                                     /* Take care of coupling of the distances
7503                                      * to the planes along dim0 and dim1 through dim2.
7504                                      */
7505                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7506                                     /* Take care that the cell planes along dim1
7507                                      * might not be orthogonal to that along dim2.
7508                                      */
7509                                     if (normal[dim1][dim2] > 0)
7510                                     {
7511                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7512                                     }
7513                                 }
7514                             }
7515                         }
7516                         /* The distance along the communication direction */
7517                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7518                         tric_sh = 0;
7519                         for(i=dim+1; i<DIM; i++)
7520                         {
7521                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7522                         }
7523                         rn[dim] += tric_sh;
7524                         if (rn[dim] > 0)
7525                         {
7526                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7527                             /* Take care of coupling of the distances
7528                              * to the planes along dim0 and dim1 through dim2.
7529                              */
7530                             if (dim_ind == 1 && zonei == 1)
7531                             {
7532                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7533                             }
7534                         }
7535                         if (bDistMB_pulse)
7536                         {
7537                             clear_rvec(rb);
7538                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7539                             if (rb[dim] > 0)
7540                             {
7541                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7542                                 /* Take care of coupling of the distances
7543                                  * to the planes along dim0 and dim1 through dim2.
7544                                  */
7545                                 if (dim_ind == 1 && zonei == 1)
7546                                 {
7547                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7548                                 }
7549                             }
7550                         }
7551                     }
7552
7553                     if (r2 < r_comm2 ||
7554                         (bDistBonded &&
7555                          ((bDistMB && rb2 < r_bcomm2) ||
7556                           (bDist2B && r2  < r_bcomm2)) &&
7557                          (!bBondComm ||
7558                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7559                            missing_link(comm->cglink,index_gl[cg],
7560                                         comm->bLocalCG)))))
7561                     {
7562                         /* Make an index to the local charge groups */
7563                         if (nsend+1 > ind->nalloc)
7564                         {
7565                             ind->nalloc = over_alloc_large(nsend+1);
7566                             srenew(ind->index,ind->nalloc);
7567                         }
7568                         if (nsend+1 > comm->nalloc_int)
7569                         {
7570                             comm->nalloc_int = over_alloc_large(nsend+1);
7571                             srenew(comm->buf_int,comm->nalloc_int);
7572                         }
7573                         ind->index[nsend] = cg;
7574                         comm->buf_int[nsend] = index_gl[cg];
7575                         ind->nsend[zone]++;
7576                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7577
7578                         if (dd->ci[dim] == 0)
7579                         {
7580                             /* Correct cg_cm for pbc */
7581                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7582                             if (bScrew)
7583                             {
7584                                 comm->vbuf.v[nsend][YY] =
7585                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7586                                 comm->vbuf.v[nsend][ZZ] =
7587                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7588                             }
7589                         }
7590                         else
7591                         {
7592                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7593                         }
7594                         nsend++;
7595                         nat += cgindex[cg+1] - cgindex[cg];
7596                     }
7597                 }
7598             }
7599             /* Clear the counts in case we do not have pbc */
7600             for(zone=nzone_send; zone<nzone; zone++)
7601             {
7602                 ind->nsend[zone] = 0;
7603             }
7604             ind->nsend[nzone]   = nsend;
7605             ind->nsend[nzone+1] = nat;
7606             /* Communicate the number of cg's and atoms to receive */
7607             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7608                             ind->nsend, nzone+2,
7609                             ind->nrecv, nzone+2);
7610
7611             /* The rvec buffer is also required for atom buffers of size nsend
7612              * in dd_move_x and dd_move_f.
7613              */
7614             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7615
7616             if (p > 0)
7617             {
7618                 /* We can receive in place if only the last zone is not empty */
7619                 for(zone=0; zone<nzone-1; zone++)
7620                 {
7621                     if (ind->nrecv[zone] > 0)
7622                     {
7623                         cd->bInPlace = FALSE;
7624                     }
7625                 }
7626                 if (!cd->bInPlace)
7627                 {
7628                     /* The int buffer is only required here for the cg indices */
7629                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7630                     {
7631                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7632                         srenew(comm->buf_int2,comm->nalloc_int2);
7633                     }
7634                     /* The rvec buffer is also required for atom buffers
7635                      * of size nrecv in dd_move_x and dd_move_f.
7636                      */
7637                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7638                     vec_rvec_check_alloc(&comm->vbuf2,i);
7639                 }
7640             }
7641
7642             /* Make space for the global cg indices */
7643             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7644                 || dd->cg_nalloc == 0)
7645             {
7646                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7647                 srenew(index_gl,dd->cg_nalloc);
7648                 srenew(cgindex,dd->cg_nalloc+1);
7649             }
7650             /* Communicate the global cg indices */
7651             if (cd->bInPlace)
7652             {
7653                 recv_i = index_gl + pos_cg;
7654             }
7655             else
7656             {
7657                 recv_i = comm->buf_int2;
7658             }
7659             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7660                             comm->buf_int, nsend,
7661                             recv_i,        ind->nrecv[nzone]);
7662
7663             /* Make space for cg_cm */
7664             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7665             {
7666                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7667                 cg_cm = fr->cg_cm;
7668             }
7669             /* Communicate cg_cm */
7670             if (cd->bInPlace)
7671             {
7672                 recv_vr = cg_cm + pos_cg;
7673             }
7674             else
7675             {
7676                 recv_vr = comm->vbuf2.v;
7677             }
7678             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7679                              comm->vbuf.v, nsend,
7680                              recv_vr,      ind->nrecv[nzone]);
7681
7682             /* Make the charge group index */
7683             if (cd->bInPlace)
7684             {
7685                 zone = (p == 0 ? 0 : nzone - 1);
7686                 while (zone < nzone)
7687                 {
7688                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7689                     {
7690                         cg_gl = index_gl[pos_cg];
7691                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7692                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7693                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7694                         if (bBondComm)
7695                         {
7696                             /* Update the charge group presence,
7697                              * so we can use it in the next pass of the loop.
7698                              */
7699                             comm->bLocalCG[cg_gl] = TRUE;
7700                         }
7701                         pos_cg++;
7702                     }
7703                     if (p == 0)
7704                     {
7705                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7706                     }
7707                     zone++;
7708                     zone_cg_range[nzone+zone] = pos_cg;
7709                 }
7710             }
7711             else
7712             {
7713                 /* This part of the code is never executed with bBondComm. */
7714                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7715                                  index_gl,recv_i,cg_cm,recv_vr,
7716                                  cgindex,fr->cginfo_mb,fr->cginfo);
7717                 pos_cg += ind->nrecv[nzone];
7718             }
7719             nat_tot += ind->nrecv[nzone+1];
7720         }
7721         if (!cd->bInPlace)
7722         {
7723             /* Store the atom block for easy copying of communication buffers */
7724             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7725         }
7726         nzone += nzone;
7727     }
7728     dd->index_gl = index_gl;
7729     dd->cgindex  = cgindex;
7730
7731     dd->ncg_tot = zone_cg_range[zones->n];
7732     dd->nat_tot = nat_tot;
7733     comm->nat[ddnatHOME] = dd->nat_home;
7734     for(i=ddnatZONE; i<ddnatNR; i++)
7735     {
7736         comm->nat[i] = dd->nat_tot;
7737     }
7738
7739     if (!bBondComm)
7740     {
7741         /* We don't need to update cginfo, since that was alrady done above.
7742          * So we pass NULL for the forcerec.
7743          */
7744         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7745                       NULL,comm->bLocalCG);
7746     }
7747
7748     if (debug)
7749     {
7750         fprintf(debug,"Finished setting up DD communication, zones:");
7751         for(c=0; c<zones->n; c++)
7752         {
7753             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7754         }
7755         fprintf(debug,"\n");
7756     }
7757 }
7758
7759 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7760 {
7761     int c;
7762
7763     for(c=0; c<zones->nizone; c++)
7764     {
7765         zones->izone[c].cg1  = zones->cg_range[c+1];
7766         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7767         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7768     }
7769 }
7770
7771 static int comp_cgsort(const void *a,const void *b)
7772 {
7773     int comp;
7774
7775     gmx_cgsort_t *cga,*cgb;
7776     cga = (gmx_cgsort_t *)a;
7777     cgb = (gmx_cgsort_t *)b;
7778
7779     comp = cga->nsc - cgb->nsc;
7780     if (comp == 0)
7781     {
7782         comp = cga->ind_gl - cgb->ind_gl;
7783     }
7784
7785     return comp;
7786 }
7787
7788 static void order_int_cg(int n,gmx_cgsort_t *sort,
7789                          int *a,int *buf)
7790 {
7791     int i;
7792
7793     /* Order the data */
7794     for(i=0; i<n; i++)
7795     {
7796         buf[i] = a[sort[i].ind];
7797     }
7798
7799     /* Copy back to the original array */
7800     for(i=0; i<n; i++)
7801     {
7802         a[i] = buf[i];
7803     }
7804 }
7805
7806 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7807                          rvec *v,rvec *buf)
7808 {
7809     int i;
7810
7811     /* Order the data */
7812     for(i=0; i<n; i++)
7813     {
7814         copy_rvec(v[sort[i].ind],buf[i]);
7815     }
7816
7817     /* Copy back to the original array */
7818     for(i=0; i<n; i++)
7819     {
7820         copy_rvec(buf[i],v[i]);
7821     }
7822 }
7823
7824 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7825                            rvec *v,rvec *buf)
7826 {
7827     int a,atot,cg,cg0,cg1,i;
7828
7829     /* Order the data */
7830     a = 0;
7831     for(cg=0; cg<ncg; cg++)
7832     {
7833         cg0 = cgindex[sort[cg].ind];
7834         cg1 = cgindex[sort[cg].ind+1];
7835         for(i=cg0; i<cg1; i++)
7836         {
7837             copy_rvec(v[i],buf[a]);
7838             a++;
7839         }
7840     }
7841     atot = a;
7842
7843     /* Copy back to the original array */
7844     for(a=0; a<atot; a++)
7845     {
7846         copy_rvec(buf[a],v[a]);
7847     }
7848 }
7849
7850 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7851                          int nsort_new,gmx_cgsort_t *sort_new,
7852                          gmx_cgsort_t *sort1)
7853 {
7854     int i1,i2,i_new;
7855
7856     /* The new indices are not very ordered, so we qsort them */
7857     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7858
7859     /* sort2 is already ordered, so now we can merge the two arrays */
7860     i1 = 0;
7861     i2 = 0;
7862     i_new = 0;
7863     while(i2 < nsort2 || i_new < nsort_new)
7864     {
7865         if (i2 == nsort2)
7866         {
7867             sort1[i1++] = sort_new[i_new++];
7868         }
7869         else if (i_new == nsort_new)
7870         {
7871             sort1[i1++] = sort2[i2++];
7872         }
7873         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7874                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7875                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7876         {
7877             sort1[i1++] = sort2[i2++];
7878         }
7879         else
7880         {
7881             sort1[i1++] = sort_new[i_new++];
7882         }
7883     }
7884 }
7885
7886 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7887                           rvec *cgcm,t_forcerec *fr,t_state *state,
7888                           int ncg_home_old)
7889 {
7890     gmx_domdec_sort_t *sort;
7891     gmx_cgsort_t *cgsort,*sort_i;
7892     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7893     rvec *vbuf;
7894
7895     sort = dd->comm->sort;
7896
7897     if (dd->ncg_home > sort->sort_nalloc)
7898     {
7899         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7900         srenew(sort->sort1,sort->sort_nalloc);
7901         srenew(sort->sort2,sort->sort_nalloc);
7902     }
7903
7904     if (ncg_home_old >= 0)
7905     {
7906         /* The charge groups that remained in the same ns grid cell
7907          * are completely ordered. So we can sort efficiently by sorting
7908          * the charge groups that did move into the stationary list.
7909          */
7910         ncg_new = 0;
7911         nsort2 = 0;
7912         nsort_new = 0;
7913         for(i=0; i<dd->ncg_home; i++)
7914         {
7915             /* Check if this cg did not move to another node */
7916             cell_index = fr->ns.grid->cell_index[i];
7917             if (cell_index !=  4*fr->ns.grid->ncells)
7918             {
7919                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7920                 {
7921                     /* This cg is new on this node or moved ns grid cell */
7922                     if (nsort_new >= sort->sort_new_nalloc)
7923                     {
7924                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7925                         srenew(sort->sort_new,sort->sort_new_nalloc);
7926                     }
7927                     sort_i = &(sort->sort_new[nsort_new++]);
7928                 }
7929                 else
7930                 {
7931                     /* This cg did not move */
7932                     sort_i = &(sort->sort2[nsort2++]);
7933                 }
7934                 /* Sort on the ns grid cell indices
7935                  * and the global topology index
7936                  */
7937                 sort_i->nsc    = cell_index;
7938                 sort_i->ind_gl = dd->index_gl[i];
7939                 sort_i->ind    = i;
7940                 ncg_new++;
7941             }
7942         }
7943         if (debug)
7944         {
7945             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7946                     nsort2,nsort_new);
7947         }
7948         /* Sort efficiently */
7949         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7950     }
7951     else
7952     {
7953         cgsort = sort->sort1;
7954         ncg_new = 0;
7955         for(i=0; i<dd->ncg_home; i++)
7956         {
7957             /* Sort on the ns grid cell indices
7958              * and the global topology index
7959              */
7960             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7961             cgsort[i].ind_gl = dd->index_gl[i];
7962             cgsort[i].ind    = i;
7963             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7964             {
7965                 ncg_new++;
7966             }
7967         }
7968         if (debug)
7969         {
7970             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7971         }
7972         /* Determine the order of the charge groups using qsort */
7973         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7974     }
7975     cgsort = sort->sort1;
7976
7977     /* We alloc with the old size, since cgindex is still old */
7978     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7979     vbuf = dd->comm->vbuf.v;
7980
7981     /* Remove the charge groups which are no longer at home here */
7982     dd->ncg_home = ncg_new;
7983
7984     /* Reorder the state */
7985     for(i=0; i<estNR; i++)
7986     {
7987         if (EST_DISTR(i) && state->flags & (1<<i))
7988         {
7989             switch (i)
7990             {
7991             case estX:
7992                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
7993                 break;
7994             case estV:
7995                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
7996                 break;
7997             case estSDX:
7998                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
7999                 break;
8000             case estCGP:
8001                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8002                 break;
8003             case estLD_RNG:
8004             case estLD_RNGI:
8005             case estDISRE_INITF:
8006             case estDISRE_RM3TAV:
8007             case estORIRE_INITF:
8008             case estORIRE_DTAV:
8009                 /* No ordering required */
8010                 break;
8011             default:
8012                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8013                 break;
8014             }
8015         }
8016     }
8017     /* Reorder cgcm */
8018     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8019
8020     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8021     {
8022         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8023         srenew(sort->ibuf,sort->ibuf_nalloc);
8024     }
8025     ibuf = sort->ibuf;
8026     /* Reorder the global cg index */
8027     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8028     /* Reorder the cginfo */
8029     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8030     /* Rebuild the local cg index */
8031     ibuf[0] = 0;
8032     for(i=0; i<dd->ncg_home; i++)
8033     {
8034         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8035         ibuf[i+1] = ibuf[i] + cgsize;
8036     }
8037     for(i=0; i<dd->ncg_home+1; i++)
8038     {
8039         dd->cgindex[i] = ibuf[i];
8040     }
8041     /* Set the home atom number */
8042     dd->nat_home = dd->cgindex[dd->ncg_home];
8043
8044     /* Copy the sorted ns cell indices back to the ns grid struct */
8045     for(i=0; i<dd->ncg_home; i++)
8046     {
8047         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8048     }
8049     fr->ns.grid->nr = dd->ncg_home;
8050 }
8051
8052 static void add_dd_statistics(gmx_domdec_t *dd)
8053 {
8054     gmx_domdec_comm_t *comm;
8055     int ddnat;
8056
8057     comm = dd->comm;
8058
8059     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8060     {
8061         comm->sum_nat[ddnat-ddnatZONE] +=
8062             comm->nat[ddnat] - comm->nat[ddnat-1];
8063     }
8064     comm->ndecomp++;
8065 }
8066
8067 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8068 {
8069     gmx_domdec_comm_t *comm;
8070     int ddnat;
8071
8072     comm = dd->comm;
8073
8074     /* Reset all the statistics and counters for total run counting */
8075     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8076     {
8077         comm->sum_nat[ddnat-ddnatZONE] = 0;
8078     }
8079     comm->ndecomp = 0;
8080     comm->nload = 0;
8081     comm->load_step = 0;
8082     comm->load_sum = 0;
8083     comm->load_max = 0;
8084     clear_ivec(comm->load_lim);
8085     comm->load_mdf = 0;
8086     comm->load_pme = 0;
8087 }
8088
8089 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8090 {
8091     gmx_domdec_comm_t *comm;
8092     int ddnat;
8093     double av;
8094
8095     comm = cr->dd->comm;
8096
8097     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8098
8099     if (fplog == NULL)
8100     {
8101         return;
8102     }
8103
8104     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8105
8106     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8107     {
8108         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8109         switch(ddnat)
8110         {
8111         case ddnatZONE:
8112             fprintf(fplog,
8113                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8114                     2,av);
8115             break;
8116         case ddnatVSITE:
8117             if (cr->dd->vsite_comm)
8118             {
8119                 fprintf(fplog,
8120                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8121                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8122                         av);
8123             }
8124             break;
8125         case ddnatCON:
8126             if (cr->dd->constraint_comm)
8127             {
8128                 fprintf(fplog,
8129                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8130                         1 + ir->nLincsIter,av);
8131             }
8132             break;
8133         default:
8134             gmx_incons(" Unknown type for DD statistics");
8135         }
8136     }
8137     fprintf(fplog,"\n");
8138
8139     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8140     {
8141         print_dd_load_av(fplog,cr->dd);
8142     }
8143 }
8144
8145 void dd_partition_system(FILE            *fplog,
8146                          gmx_large_int_t      step,
8147                          t_commrec       *cr,
8148                          gmx_bool            bMasterState,
8149                          int             nstglobalcomm,
8150                          t_state         *state_global,
8151                          gmx_mtop_t      *top_global,
8152                          t_inputrec      *ir,
8153                          t_state         *state_local,
8154                          rvec            **f,
8155                          t_mdatoms       *mdatoms,
8156                          gmx_localtop_t  *top_local,
8157                          t_forcerec      *fr,
8158                          gmx_vsite_t     *vsite,
8159                          gmx_shellfc_t   shellfc,
8160                          gmx_constr_t    constr,
8161                          t_nrnb          *nrnb,
8162                          gmx_wallcycle_t wcycle,
8163                          gmx_bool            bVerbose)
8164 {
8165     gmx_domdec_t *dd;
8166     gmx_domdec_comm_t *comm;
8167     gmx_ddbox_t ddbox={0};
8168     t_block *cgs_gl;
8169     gmx_large_int_t step_pcoupl;
8170     rvec cell_ns_x0,cell_ns_x1;
8171     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8172     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8173     gmx_bool bRedist,bSortCG,bResortAll;
8174     ivec ncells_old,np;
8175     real grid_density;
8176     char sbuf[22];
8177
8178     dd = cr->dd;
8179     comm = dd->comm;
8180
8181     bBoxChanged = (bMasterState || DEFORM(*ir));
8182     if (ir->epc != epcNO)
8183     {
8184         /* With nstcalcenery > 1 pressure coupling happens.
8185          * one step after calculating the energies.
8186          * Box scaling happens at the end of the MD step,
8187          * after the DD partitioning.
8188          * We therefore have to do DLB in the first partitioning
8189          * after an MD step where P-coupling occured.
8190          * We need to determine the last step in which p-coupling occurred.
8191          * MRS -- need to validate this for vv?
8192          */
8193         n = ir->nstcalcenergy;
8194         if (n == 1)
8195         {
8196             step_pcoupl = step - 1;
8197         }
8198         else
8199         {
8200             step_pcoupl = ((step - 1)/n)*n + 1;
8201         }
8202         if (step_pcoupl >= comm->partition_step)
8203         {
8204             bBoxChanged = TRUE;
8205         }
8206     }
8207
8208     bNStGlobalComm = (step >= comm->partition_step + nstglobalcomm);
8209
8210     if (!comm->bDynLoadBal)
8211     {
8212         bDoDLB = FALSE;
8213     }
8214     else
8215     {
8216         /* Should we do dynamic load balacing this step?
8217          * Since it requires (possibly expensive) global communication,
8218          * we might want to do DLB less frequently.
8219          */
8220         if (bBoxChanged || ir->epc != epcNO)
8221         {
8222             bDoDLB = bBoxChanged;
8223         }
8224         else
8225         {
8226             bDoDLB = bNStGlobalComm;
8227         }
8228     }
8229
8230     /* Check if we have recorded loads on the nodes */
8231     if (comm->bRecordLoad && dd_load_count(comm))
8232     {
8233         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8234         {
8235             /* Check if we should use DLB at the second partitioning
8236              * and every 100 partitionings,
8237              * so the extra communication cost is negligible.
8238              */
8239             n = max(100,nstglobalcomm);
8240             bCheckDLB = (comm->n_load_collect == 0 ||
8241                          comm->n_load_have % n == n-1);
8242         }
8243         else
8244         {
8245             bCheckDLB = FALSE;
8246         }
8247
8248         /* Print load every nstlog, first and last step to the log file */
8249         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8250                     comm->n_load_collect == 0 ||
8251                     (step + ir->nstlist > ir->init_step + ir->nsteps));
8252
8253         /* Avoid extra communication due to verbose screen output
8254          * when nstglobalcomm is set.
8255          */
8256         if (bDoDLB || bLogLoad || bCheckDLB ||
8257             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8258         {
8259             get_load_distribution(dd,wcycle);
8260             if (DDMASTER(dd))
8261             {
8262                 if (bLogLoad)
8263                 {
8264                     dd_print_load(fplog,dd,step-1);
8265                 }
8266                 if (bVerbose)
8267                 {
8268                     dd_print_load_verbose(dd);
8269                 }
8270             }
8271             comm->n_load_collect++;
8272
8273             if (bCheckDLB) {
8274                 /* Since the timings are node dependent, the master decides */
8275                 if (DDMASTER(dd))
8276                 {
8277                     bTurnOnDLB =
8278                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8279                     if (debug)
8280                     {
8281                         fprintf(debug,"step %s, imb loss %f\n",
8282                                 gmx_step_str(step,sbuf),
8283                                 dd_force_imb_perf_loss(dd));
8284                     }
8285                 }
8286                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8287                 if (bTurnOnDLB)
8288                 {
8289                     turn_on_dlb(fplog,cr,step);
8290                     bDoDLB = TRUE;
8291                 }
8292             }
8293         }
8294         comm->n_load_have++;
8295     }
8296
8297     cgs_gl = &comm->cgs_gl;
8298
8299     bRedist = FALSE;
8300     if (bMasterState)
8301     {
8302         /* Clear the old state */
8303         clear_dd_indices(dd,0,0);
8304
8305         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8306                   TRUE,cgs_gl,state_global->x,&ddbox);
8307
8308         get_cg_distribution(fplog,step,dd,cgs_gl,
8309                             state_global->box,&ddbox,state_global->x);
8310
8311         dd_distribute_state(dd,cgs_gl,
8312                             state_global,state_local,f);
8313
8314         dd_make_local_cgs(dd,&top_local->cgs);
8315
8316         if (dd->ncg_home > fr->cg_nalloc)
8317         {
8318             dd_realloc_fr_cg(fr,dd->ncg_home);
8319         }
8320         calc_cgcm(fplog,0,dd->ncg_home,
8321                   &top_local->cgs,state_local->x,fr->cg_cm);
8322
8323         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8324
8325         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8326
8327         cg0 = 0;
8328     }
8329     else if (state_local->ddp_count != dd->ddp_count)
8330     {
8331         if (state_local->ddp_count > dd->ddp_count)
8332         {
8333             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8334         }
8335
8336         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8337         {
8338             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8339         }
8340
8341         /* Clear the old state */
8342         clear_dd_indices(dd,0,0);
8343
8344         /* Build the new indices */
8345         rebuild_cgindex(dd,cgs_gl->index,state_local);
8346         make_dd_indices(dd,cgs_gl->index,0);
8347
8348         /* Redetermine the cg COMs */
8349         calc_cgcm(fplog,0,dd->ncg_home,
8350                   &top_local->cgs,state_local->x,fr->cg_cm);
8351
8352         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8353
8354         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8355
8356         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8357                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8358
8359         bRedist = comm->bDynLoadBal;
8360     }
8361     else
8362     {
8363         /* We have the full state, only redistribute the cgs */
8364
8365         /* Clear the non-home indices */
8366         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8367
8368         /* Avoid global communication for dim's without pbc and -gcom */
8369         if (!bNStGlobalComm)
8370         {
8371             copy_rvec(comm->box0    ,ddbox.box0    );
8372             copy_rvec(comm->box_size,ddbox.box_size);
8373         }
8374         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8375                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8376
8377         bBoxChanged = TRUE;
8378         bRedist = TRUE;
8379     }
8380     /* For dim's without pbc and -gcom */
8381     copy_rvec(ddbox.box0    ,comm->box0    );
8382     copy_rvec(ddbox.box_size,comm->box_size);
8383
8384     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8385                       step,wcycle);
8386
8387     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8388     {
8389         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8390     }
8391
8392     /* Check if we should sort the charge groups */
8393     if (comm->nstSortCG > 0)
8394     {
8395         bSortCG = (bMasterState ||
8396                    (bRedist && (step % comm->nstSortCG == 0)));
8397     }
8398     else
8399     {
8400         bSortCG = FALSE;
8401     }
8402
8403     ncg_home_old = dd->ncg_home;
8404
8405     if (bRedist)
8406     {
8407         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8408                                  state_local,f,fr,mdatoms,
8409                                  !bSortCG,nrnb);
8410     }
8411
8412     get_nsgrid_boundaries(fr->ns.grid,dd,
8413                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8414                           dd->ncg_home,fr->cg_cm,
8415                           cell_ns_x0,cell_ns_x1,&grid_density);
8416
8417     if (bBoxChanged)
8418     {
8419         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8420     }
8421
8422     copy_ivec(fr->ns.grid->n,ncells_old);
8423     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8424                state_local->box,cell_ns_x0,cell_ns_x1,
8425                fr->rlistlong,grid_density);
8426     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8427     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8428
8429     if (bSortCG)
8430     {
8431         /* Sort the state on charge group position.
8432          * This enables exact restarts from this step.
8433          * It also improves performance by about 15% with larger numbers
8434          * of atoms per node.
8435          */
8436
8437         /* Fill the ns grid with the home cell,
8438          * so we can sort with the indices.
8439          */
8440         set_zones_ncg_home(dd);
8441         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8442                   0,dd->ncg_home,fr->cg_cm);
8443
8444         /* Check if we can user the old order and ns grid cell indices
8445          * of the charge groups to sort the charge groups efficiently.
8446          */
8447         bResortAll = (bMasterState ||
8448                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8449                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8450                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8451
8452         if (debug)
8453         {
8454             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8455                     gmx_step_str(step,sbuf),dd->ncg_home);
8456         }
8457         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8458                       bResortAll ? -1 : ncg_home_old);
8459         /* Rebuild all the indices */
8460         cg0 = 0;
8461         ga2la_clear(dd->ga2la);
8462     }
8463
8464     /* Setup up the communication and communicate the coordinates */
8465     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8466
8467     /* Set the indices */
8468     make_dd_indices(dd,cgs_gl->index,cg0);
8469
8470     /* Set the charge group boundaries for neighbor searching */
8471     set_cg_boundaries(&comm->zones);
8472
8473     /*
8474     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8475                  -1,state_local->x,state_local->box);
8476     */
8477
8478     /* Extract a local topology from the global topology */
8479     for(i=0; i<dd->ndim; i++)
8480     {
8481         np[dd->dim[i]] = comm->cd[i].np;
8482     }
8483     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8484                       comm->cellsize_min,np,
8485                       fr,vsite,top_global,top_local);
8486
8487     /* Set up the special atom communication */
8488     n = comm->nat[ddnatZONE];
8489     for(i=ddnatZONE+1; i<ddnatNR; i++)
8490     {
8491         switch(i)
8492         {
8493         case ddnatVSITE:
8494             if (vsite && vsite->n_intercg_vsite)
8495             {
8496                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8497             }
8498             break;
8499         case ddnatCON:
8500             if (dd->bInterCGcons)
8501             {
8502                 /* Only for inter-cg constraints we need special code */
8503                 n = dd_make_local_constraints(dd,n,top_global,
8504                                               constr,ir->nProjOrder,
8505                                               &top_local->idef.il[F_CONSTR]);
8506             }
8507             break;
8508         default:
8509             gmx_incons("Unknown special atom type setup");
8510         }
8511         comm->nat[i] = n;
8512     }
8513
8514     /* Make space for the extra coordinates for virtual site
8515      * or constraint communication.
8516      */
8517     state_local->natoms = comm->nat[ddnatNR-1];
8518     if (state_local->natoms > state_local->nalloc)
8519     {
8520         dd_realloc_state(state_local,f,state_local->natoms);
8521     }
8522
8523     if (fr->bF_NoVirSum)
8524     {
8525         if (vsite && vsite->n_intercg_vsite)
8526         {
8527             nat_f_novirsum = comm->nat[ddnatVSITE];
8528         }
8529         else
8530         {
8531             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8532             {
8533                 nat_f_novirsum = dd->nat_tot;
8534             }
8535             else
8536             {
8537                 nat_f_novirsum = dd->nat_home;
8538             }
8539         }
8540     }
8541     else
8542     {
8543         nat_f_novirsum = 0;
8544     }
8545
8546     /* Set the number of atoms required for the force calculation.
8547      * Forces need to be constrained when using a twin-range setup
8548      * or with energy minimization. For simple simulations we could
8549      * avoid some allocation, zeroing and copying, but this is
8550      * probably not worth the complications ande checking.
8551      */
8552     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8553                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8554
8555     /* We make the all mdatoms up to nat_tot_con.
8556      * We could save some work by only setting invmass
8557      * between nat_tot and nat_tot_con.
8558      */
8559     /* This call also sets the new number of home particles to dd->nat_home */
8560     atoms2md(top_global,ir,
8561              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8562
8563     /* Now we have the charges we can sort the FE interactions */
8564     dd_sort_local_top(dd,mdatoms,top_local);
8565
8566     if (shellfc)
8567     {
8568         /* Make the local shell stuff, currently no communication is done */
8569         make_local_shells(cr,mdatoms,shellfc);
8570     }
8571
8572         if (ir->implicit_solvent)
8573     {
8574         make_local_gb(cr,fr->born,ir->gb_algorithm);
8575     }
8576
8577     if (!(cr->duty & DUTY_PME))
8578     {
8579         /* Send the charges to our PME only node */
8580         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8581                        mdatoms->chargeA,mdatoms->chargeB,
8582                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8583     }
8584
8585     if (constr)
8586     {
8587         set_constraints(constr,top_local,ir,mdatoms,cr);
8588     }
8589
8590     if (ir->ePull != epullNO)
8591     {
8592         /* Update the local pull groups */
8593         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8594     }
8595
8596     if (ir->bRot)
8597     {
8598         /* Update the local rotation groups */
8599         dd_make_local_rotation_groups(dd,ir->rot);
8600     }
8601
8602
8603     add_dd_statistics(dd);
8604
8605     /* Make sure we only count the cycles for this DD partitioning */
8606     clear_dd_cycle_counts(dd);
8607
8608     /* Because the order of the atoms might have changed since
8609      * the last vsite construction, we need to communicate the constructing
8610      * atom coordinates again (for spreading the forces this MD step).
8611      */
8612     dd_move_x_vsites(dd,state_local->box,state_local->x);
8613
8614     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8615     {
8616         dd_move_x(dd,state_local->box,state_local->x);
8617         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8618                      -1,state_local->x,state_local->box);
8619     }
8620
8621     /* Store the partitioning step */
8622     comm->partition_step = step;
8623
8624     /* Increase the DD partitioning counter */
8625     dd->ddp_count++;
8626     /* The state currently matches this DD partitioning count, store it */
8627     state_local->ddp_count = dd->ddp_count;
8628     if (bMasterState)
8629     {
8630         /* The DD master node knows the complete cg distribution,
8631          * store the count so we can possibly skip the cg info communication.
8632          */
8633         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8634     }
8635
8636     if (comm->DD_debug > 0)
8637     {
8638         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8639         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8640                                 "after partitioning");
8641     }
8642 }