src/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "vec.h"
  31 #include "domdec.h"
  32 #include "domdec_network.h"
  33 #include "nrnb.h"
  34 #include "pbc.h"
  35 #include "chargegroup.h"
  36 #include "constr.h"
  37 #include "mdatoms.h"
  38 #include "names.h"
  39 #include "pdbio.h"
  40 #include "futil.h"
  41 #include "force.h"
  42 #include "pme.h"
  43 #include "pull.h"
  44 #include "gmx_wallcycle.h"
  45 #include "mdrun.h"
  46 #include "nsgrid.h"
  47 #include "shellfc.h"
  48 #include "mtop_util.h"
  49 #include "gmxfio.h"
  50 #include "gmx_ga2la.h"
  51 #include "gmx_sort.h"
  52
  53 #ifdef GMX_LIB_MPI
  54 #include <mpi.h>
  55 #endif
  56 #ifdef GMX_THREADS
  57 #include "tmpi.h"
  58 #endif
  59
  60 #define DDRANK(dd,rank)    (rank)
  61 #define DDMASTERRANK(dd)   (dd->masterrank)
  62
  63 typedef struct gmx_domdec_master
  64 {
  65     /* The cell boundaries */
  66     real **cell_x;
  67     /* The global charge group division */
  68     int  *ncg;     /* Number of home charge groups for each node */
  69     int  *index;   /* Index of nnodes+1 into cg */
  70     int  *cg;      /* Global charge group index */
  71     int  *nat;     /* Number of home atoms for each node. */
  72     int  *ibuf;    /* Buffer for communication */
  73     rvec *vbuf;    /* Buffer for state scattering and gathering */
  74 } gmx_domdec_master_t;
  75
  76 typedef struct
  77 {
  78     /* The numbers of charge groups to send and receive for each cell
  79      * that requires communication, the last entry contains the total
  80      * number of atoms that needs to be communicated.
  81      */
  82     int nsend[DD_MAXIZONE+2];
  83     int nrecv[DD_MAXIZONE+2];
  84     /* The charge groups to send */
  85     int *index;
  86     int nalloc;
  87     /* The atom range for non-in-place communication */
  88     int cell2at0[DD_MAXIZONE];
  89     int cell2at1[DD_MAXIZONE];
  90 } gmx_domdec_ind_t;
  91
  92 typedef struct
  93 {
  94     int  np;                   /* Number of grid pulses in this dimension */
  95     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
  96     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
  97     int  np_nalloc;
  98     bool bInPlace;             /* Can we communicate in place?            */
  99 } gmx_domdec_comm_dim_t;
 100
 101 typedef struct
 102 {
 103     bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 104     real *cell_f;      /* State var.: cell boundaries, box relative      */
 105     real *old_cell_f;  /* Temp. var.: old cell size                      */
 106     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 107     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 108     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 109     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 110     bool bLimited;     /* State var.: is DLB limited in this dim and row */
 111     real *buf_ncd;     /* Temp. var.                                     */
 112 } gmx_domdec_root_t;
 113
 114 #define DD_NLOAD_MAX 9
 115
 116 /* Here floats are accurate enough, since these variables
 117  * only influence the load balancing, not the actual MD results.
 118  */
 119 typedef struct
 120 {
 121     int  nload;
 122     float *load;
 123     float sum;
 124     float max;
 125     float sum_m;
 126     float cvol_min;
 127     float mdf;
 128     float pme;
 129     int   flags;
 130 } gmx_domdec_load_t;
 131
 132 typedef struct
 133 {
 134     int  nsc;
 135     int  ind_gl;
 136     int  ind;
 137 } gmx_cgsort_t;
 138
 139 typedef struct
 140 {
 141     gmx_cgsort_t *sort1,*sort2;
 142     int  sort_nalloc;
 143     gmx_cgsort_t *sort_new;
 144     int  sort_new_nalloc;
 145     int  *ibuf;
 146     int  ibuf_nalloc;
 147 } gmx_domdec_sort_t;
 148
 149 typedef struct
 150 {
 151     rvec *v;
 152     int  nalloc;
 153 } vec_rvec_t;
 154
 155 /* This enum determines the order of the coordinates.
 156  * ddnatHOME and ddnatZONE should be first and second,
 157  * the others can be ordered as wanted.
 158  */
 159 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 160
 161 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 162 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 163
 164 typedef struct
 165 {
 166     int  dim;      /* The dimension                                          */
 167     bool dim_match;/* Tells if DD and PME dims match                         */
 168     int  nslab;    /* The number of PME slabs in this dimension              */
 169     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 170     int  *pp_min;  /* The minimum pp node location, size nslab               */
 171     int  *pp_max;  /* The maximum pp node location,size nslab                */
 172     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 173 } gmx_ddpme_t;
 174
 175 typedef struct
 176 {
 177     real min0;    /* The minimum bottom of this zone                        */
 178     real max1;    /* The maximum top of this zone                           */
 179     real mch0;    /* The maximum bottom communicaton height for this zone   */
 180     real mch1;    /* The maximum top communicaton height for this zone      */
 181     real p1_0;    /* The bottom value of the first cell in this zone        */
 182     real p1_1;    /* The top value of the first cell in this zone           */
 183 } gmx_ddzone_t;
 184
 185 typedef struct gmx_domdec_comm
 186 {
 187     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 188      * unless stated otherwise.
 189      */
 190
 191     /* The number of decomposition dimensions for PME, 0: no PME */
 192     int  npmedecompdim;
 193     /* The number of nodes doing PME (PP/PME or only PME) */
 194     int  npmenodes;
 195     int  npmenodes_x;
 196     int  npmenodes_y;
 197     /* The communication setup including the PME only nodes */
 198     bool bCartesianPP_PME;
 199     ivec ntot;
 200     int  cartpmedim;
 201     int  *pmenodes;          /* size npmenodes                         */
 202     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 203                               * but with bCartesianPP_PME              */
 204     gmx_ddpme_t ddpme[2];
 205
 206     /* The DD particle-particle nodes only */
 207     bool bCartesianPP;
 208     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 209
 210     /* The global charge groups */
 211     t_block cgs_gl;
 212
 213     /* Should we sort the cgs */
 214     int  nstSortCG;
 215     gmx_domdec_sort_t *sort;
 216
 217     /* Are there bonded and multi-body interactions between charge groups? */
 218     bool bInterCGBondeds;
 219     bool bInterCGMultiBody;
 220
 221     /* Data for the optional bonded interaction atom communication range */
 222     bool bBondComm;
 223     t_blocka *cglink;
 224     char *bLocalCG;
 225
 226     /* The DLB option */
 227     int  eDLB;
 228     /* Are we actually using DLB? */
 229     bool bDynLoadBal;
 230
 231     /* Cell sizes for static load balancing, first index cartesian */
 232     real **slb_frac;
 233
 234     /* The width of the communicated boundaries */
 235     real cutoff_mbody;
 236     real cutoff;
 237     /* The minimum cell size (including triclinic correction) */
 238     rvec cellsize_min;
 239     /* For dlb, for use with edlbAUTO */
 240     rvec cellsize_min_dlb;
 241     /* The lower limit for the DD cell size with DLB */
 242     real cellsize_limit;
 243     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 244     bool bVacDLBNoLimit;
 245
 246     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 247     ivec tric_dir;
 248     /* box0 and box_size are required with dim's without pbc and -gcom */
 249     rvec box0;
 250     rvec box_size;
 251
 252     /* The cell boundaries */
 253     rvec cell_x0;
 254     rvec cell_x1;
 255
 256     /* The old location of the cell boundaries, to check cg displacements */
 257     rvec old_cell_x0;
 258     rvec old_cell_x1;
 259
 260     /* The communication setup and charge group boundaries for the zones */
 261     gmx_domdec_zones_t zones;
 262
 263     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 264      * cell boundaries of neighboring cells for dynamic load balancing.
 265      */
 266     gmx_ddzone_t zone_d1[2];
 267     gmx_ddzone_t zone_d2[2][2];
 268
 269     /* The coordinate/force communication setup and indices */
 270     gmx_domdec_comm_dim_t cd[DIM];
 271     /* The maximum number of cells to communicate with in one dimension */
 272     int  maxpulse;
 273
 274     /* Which cg distribution is stored on the master node */
 275     int master_cg_ddp_count;
 276
 277     /* The number of cg's received from the direct neighbors */
 278     int  zone_ncg1[DD_MAXZONE];
 279
 280     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 281     int  nat[ddnatNR];
 282
 283     /* Communication buffer for general use */
 284     int  *buf_int;
 285     int  nalloc_int;
 286
 287      /* Communication buffer for general use */
 288     vec_rvec_t vbuf;
 289
 290     /* Communication buffers only used with multiple grid pulses */
 291     int  *buf_int2;
 292     int  nalloc_int2;
 293     vec_rvec_t vbuf2;
 294
 295     /* Communication buffers for local redistribution */
 296     int  **cggl_flag;
 297     int  cggl_flag_nalloc[DIM*2];
 298     rvec **cgcm_state;
 299     int  cgcm_state_nalloc[DIM*2];
 300
 301     /* Cell sizes for dynamic load balancing */
 302     gmx_domdec_root_t **root;
 303     real *cell_f_row;
 304     real cell_f0[DIM];
 305     real cell_f1[DIM];
 306     real cell_f_max0[DIM];
 307     real cell_f_min1[DIM];
 308
 309     /* Stuff for load communication */
 310     bool bRecordLoad;
 311     gmx_domdec_load_t *load;
 312 #ifdef GMX_MPI
 313     MPI_Comm *mpi_comm_load;
 314 #endif
 315     /* Cycle counters */
 316     float cycl[ddCyclNr];
 317     int   cycl_n[ddCyclNr];
 318     float cycl_max[ddCyclNr];
 319     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 320     int eFlop;
 321     double flop;
 322     int    flop_n;
 323     /* Have often have did we have load measurements */
 324     int    n_load_have;
 325     /* Have often have we collected the load measurements */
 326     int    n_load_collect;
 327
 328     /* Statistics */
 329     double sum_nat[ddnatNR-ddnatZONE];
 330     int    ndecomp;
 331     int    nload;
 332     double load_step;
 333     double load_sum;
 334     double load_max;
 335     ivec   load_lim;
 336     double load_mdf;
 337     double load_pme;
 338
 339     /* The last partition step */
 340     gmx_large_int_t partition_step;
 341
 342     /* Debugging */
 343     int  nstDDDump;
 344     int  nstDDDumpGrid;
 345     int  DD_debug;
 346 } gmx_domdec_comm_t;
 347
 348 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 349 #define DD_CGIBS 2
 350
 351 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 352 #define DD_FLAG_NRCG  65535
 353 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 354 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 355
 356 /* Zone permutation required to obtain consecutive charge groups
 357  * for neighbor searching.
 358  */
 359 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 360
 361 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 362  * components see only j zones with that component 0.
 363  */
 364
 365 /* The DD zone order */
 366 static const ivec dd_zo[DD_MAXZONE] =
 367   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 368
 369 /* The 3D setup */
 370 #define dd_z3n  8
 371 #define dd_zp3n 4
 372 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 373
 374 /* The 2D setup */
 375 #define dd_z2n  4
 376 #define dd_zp2n 2
 377 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 378
 379 /* The 1D setup */
 380 #define dd_z1n  2
 381 #define dd_zp1n 1
 382 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 383
 384 /* Factors used to avoid problems due to rounding issues */
 385 #define DD_CELL_MARGIN       1.0001
 386 #define DD_CELL_MARGIN2      1.00005
 387 /* Factor to account for pressure scaling during nstlist steps */
 388 #define DD_PRES_SCALE_MARGIN 1.02
 389
 390 /* Allowed performance loss before we DLB or warn */
 391 #define DD_PERF_LOSS 0.05
 392
 393 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 394
 395 /* Use separate MPI send and receive commands
 396  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 397  * This saves memory (and some copying for small nnodes).
 398  * For high parallelization scatter and gather calls are used.
 399  */
 400 #define GMX_DD_NNODES_SENDRECV 4
 401
 402
 403 /*
 404 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 405
 406 static void index2xyz(ivec nc,int ind,ivec xyz)
 407 {
 408   xyz[XX] = ind % nc[XX];
 409   xyz[YY] = (ind / nc[XX]) % nc[YY];
 410   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 411 }
 412 */
 413
 414 /* This order is required to minimize the coordinate communication in PME
 415  * which uses decomposition in the x direction.
 416  */
 417 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 418
 419 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 420 {
 421     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 422     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 423     xyz[ZZ] = ind % nc[ZZ];
 424 }
 425
 426 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 427 {
 428     int ddindex;
 429     int ddnodeid=-1;
 430
 431     ddindex = dd_index(dd->nc,c);
 432     if (dd->comm->bCartesianPP_PME)
 433     {
 434         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 435     }
 436     else if (dd->comm->bCartesianPP)
 437     {
 438 #ifdef GMX_MPI
 439         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 440 #endif
 441     }
 442     else
 443     {
 444         ddnodeid = ddindex;
 445     }
 446
 447     return ddnodeid;
 448 }
 449
 450 static bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 451 {
 452     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 453 }
 454
 455 int ddglatnr(gmx_domdec_t *dd,int i)
 456 {
 457     int atnr;
 458
 459     if (dd == NULL)
 460     {
 461         atnr = i + 1;
 462     }
 463     else
 464     {
 465         if (i >= dd->comm->nat[ddnatNR-1])
 466         {
 467             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 468         }
 469         atnr = dd->gatindex[i] + 1;
 470     }
 471
 472     return atnr;
 473 }
 474
 475 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 476 {
 477     return &dd->comm->cgs_gl;
 478 }
 479
 480 static void vec_rvec_init(vec_rvec_t *v)
 481 {
 482     v->nalloc = 0;
 483     v->v      = NULL;
 484 }
 485
 486 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 487 {
 488     if (n > v->nalloc)
 489     {
 490         v->nalloc = over_alloc_dd(n);
 491         srenew(v->v,v->nalloc);
 492     }
 493 }
 494
 495 void dd_store_state(gmx_domdec_t *dd,t_state *state)
 496 {
 497     int i;
 498
 499     if (state->ddp_count != dd->ddp_count)
 500     {
 501         gmx_incons("The state does not the domain decomposition state");
 502     }
 503
 504     state->ncg_gl = dd->ncg_home;
 505     if (state->ncg_gl > state->cg_gl_nalloc)
 506     {
 507         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 508         srenew(state->cg_gl,state->cg_gl_nalloc);
 509     }
 510     for(i=0; i<state->ncg_gl; i++)
 511     {
 512         state->cg_gl[i] = dd->index_gl[i];
 513     }
 514
 515     state->ddp_count_cg_gl = dd->ddp_count;
 516 }
 517
 518 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 519 {
 520     return &dd->comm->zones;
 521 }
 522
 523 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 524                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 525 {
 526     gmx_domdec_zones_t *zones;
 527     int izone,d,dim;
 528
 529     zones = &dd->comm->zones;
 530
 531     izone = 0;
 532     while (icg >= zones->izone[izone].cg1)
 533     {
 534         izone++;
 535     }
 536
 537     if (izone == 0)
 538     {
 539         *jcg0 = icg;
 540     }
 541     else if (izone < zones->nizone)
 542     {
 543         *jcg0 = zones->izone[izone].jcg0;
 544     }
 545     else
 546     {
 547         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 548                   icg,izone,zones->nizone);
 549     }
 550
 551     *jcg1 = zones->izone[izone].jcg1;
 552
 553     for(d=0; d<dd->ndim; d++)
 554     {
 555         dim = dd->dim[d];
 556         shift0[dim] = zones->izone[izone].shift0[dim];
 557         shift1[dim] = zones->izone[izone].shift1[dim];
 558         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 559         {
 560             /* A conservative approach, this can be optimized */
 561             shift0[dim] -= 1;
 562             shift1[dim] += 1;
 563         }
 564     }
 565 }
 566
 567 int dd_natoms_vsite(gmx_domdec_t *dd)
 568 {
 569     return dd->comm->nat[ddnatVSITE];
 570 }
 571
 572 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 573 {
 574     *at_start = dd->comm->nat[ddnatCON-1];
 575     *at_end   = dd->comm->nat[ddnatCON];
 576 }
 577
 578 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 579 {
 580     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 581     int  *index,*cgindex;
 582     gmx_domdec_comm_t *comm;
 583     gmx_domdec_comm_dim_t *cd;
 584     gmx_domdec_ind_t *ind;
 585     rvec shift={0,0,0},*buf,*rbuf;
 586     bool bPBC,bScrew;
 587
 588     comm = dd->comm;
 589
 590     cgindex = dd->cgindex;
 591
 592     buf = comm->vbuf.v;
 593
 594     nzone = 1;
 595     nat_tot = dd->nat_home;
 596     for(d=0; d<dd->ndim; d++)
 597     {
 598         bPBC   = (dd->ci[dd->dim[d]] == 0);
 599         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 600         if (bPBC)
 601         {
 602             copy_rvec(box[dd->dim[d]],shift);
 603         }
 604         cd = &comm->cd[d];
 605         for(p=0; p<cd->np; p++)
 606         {
 607             ind = &cd->ind[p];
 608             index = ind->index;
 609             n = 0;
 610             if (!bPBC)
 611             {
 612                 for(i=0; i<ind->nsend[nzone]; i++)
 613                 {
 614                     at0 = cgindex[index[i]];
 615                     at1 = cgindex[index[i]+1];
 616                     for(j=at0; j<at1; j++)
 617                     {
 618                         copy_rvec(x[j],buf[n]);
 619                         n++;
 620                     }
 621                 }
 622             }
 623             else if (!bScrew)
 624             {
 625                 for(i=0; i<ind->nsend[nzone]; i++)
 626                 {
 627                     at0 = cgindex[index[i]];
 628                     at1 = cgindex[index[i]+1];
 629                     for(j=at0; j<at1; j++)
 630                     {
 631                         /* We need to shift the coordinates */
 632                         rvec_add(x[j],shift,buf[n]);
 633                         n++;
 634                     }
 635                 }
 636             }
 637             else
 638             {
 639                 for(i=0; i<ind->nsend[nzone]; i++)
 640                 {
 641                     at0 = cgindex[index[i]];
 642                     at1 = cgindex[index[i]+1];
 643                     for(j=at0; j<at1; j++)
 644                     {
 645                         /* Shift x */
 646                         buf[n][XX] = x[j][XX] + shift[XX];
 647                         /* Rotate y and z.
 648                          * This operation requires a special shift force
 649                          * treatment, which is performed in calc_vir.
 650                          */
 651                         buf[n][YY] = box[YY][YY] - x[j][YY];
 652                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 653                         n++;
 654                     }
 655                 }
 656             }
 657
 658             if (cd->bInPlace)
 659             {
 660                 rbuf = x + nat_tot;
 661             }
 662             else
 663             {
 664                 rbuf = comm->vbuf2.v;
 665             }
 666             /* Send and receive the coordinates */
 667             dd_sendrecv_rvec(dd, d, dddirBackward,
 668                              buf,  ind->nsend[nzone+1],
 669                              rbuf, ind->nrecv[nzone+1]);
 670             if (!cd->bInPlace)
 671             {
 672                 j = 0;
 673                 for(zone=0; zone<nzone; zone++)
 674                 {
 675                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 676                     {
 677                         copy_rvec(rbuf[j],x[i]);
 678                         j++;
 679                     }
 680                 }
 681             }
 682             nat_tot += ind->nrecv[nzone+1];
 683         }
 684         nzone += nzone;
 685     }
 686 }
 687
 688 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 689 {
 690     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 691     int  *index,*cgindex;
 692     gmx_domdec_comm_t *comm;
 693     gmx_domdec_comm_dim_t *cd;
 694     gmx_domdec_ind_t *ind;
 695     rvec *buf,*sbuf;
 696     ivec vis;
 697     int  is;
 698     bool bPBC,bScrew;
 699
 700     comm = dd->comm;
 701
 702     cgindex = dd->cgindex;
 703
 704     buf = comm->vbuf.v;
 705
 706     n = 0;
 707     nzone = comm->zones.n/2;
 708     nat_tot = dd->nat_tot;
 709     for(d=dd->ndim-1; d>=0; d--)
 710     {
 711         bPBC   = (dd->ci[dd->dim[d]] == 0);
 712         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 713         if (fshift == NULL && !bScrew)
 714         {
 715             bPBC = FALSE;
 716         }
 717         /* Determine which shift vector we need */
 718         clear_ivec(vis);
 719         vis[dd->dim[d]] = 1;
 720         is = IVEC2IS(vis);
 721
 722         cd = &comm->cd[d];
 723         for(p=cd->np-1; p>=0; p--) {
 724             ind = &cd->ind[p];
 725             nat_tot -= ind->nrecv[nzone+1];
 726             if (cd->bInPlace)
 727             {
 728                 sbuf = f + nat_tot;
 729             }
 730             else
 731             {
 732                 sbuf = comm->vbuf2.v;
 733                 j = 0;
 734                 for(zone=0; zone<nzone; zone++)
 735                 {
 736                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 737                     {
 738                         copy_rvec(f[i],sbuf[j]);
 739                         j++;
 740                     }
 741                 }
 742             }
 743             /* Communicate the forces */
 744             dd_sendrecv_rvec(dd, d, dddirForward,
 745                              sbuf, ind->nrecv[nzone+1],
 746                              buf,  ind->nsend[nzone+1]);
 747             index = ind->index;
 748             /* Add the received forces */
 749             n = 0;
 750             if (!bPBC)
 751             {
 752                 for(i=0; i<ind->nsend[nzone]; i++)
 753                 {
 754                     at0 = cgindex[index[i]];
 755                     at1 = cgindex[index[i]+1];
 756                     for(j=at0; j<at1; j++)
 757                     {
 758                         rvec_inc(f[j],buf[n]);
 759                         n++;
 760                     }
 761                 }
 762             }
 763             else if (!bScrew)
 764             {
 765                 for(i=0; i<ind->nsend[nzone]; i++)
 766                 {
 767                     at0 = cgindex[index[i]];
 768                     at1 = cgindex[index[i]+1];
 769                     for(j=at0; j<at1; j++)
 770                     {
 771                         rvec_inc(f[j],buf[n]);
 772                         /* Add this force to the shift force */
 773                         rvec_inc(fshift[is],buf[n]);
 774                         n++;
 775                     }
 776                 }
 777             }
 778             else
 779             {
 780                 for(i=0; i<ind->nsend[nzone]; i++)
 781                 {
 782                     at0 = cgindex[index[i]];
 783                     at1 = cgindex[index[i]+1];
 784                     for(j=at0; j<at1; j++)
 785                     {
 786                         /* Rotate the force */
 787                         f[j][XX] += buf[n][XX];
 788                         f[j][YY] -= buf[n][YY];
 789                         f[j][ZZ] -= buf[n][ZZ];
 790                         if (fshift)
 791                         {
 792                             /* Add this force to the shift force */
 793                             rvec_inc(fshift[is],buf[n]);
 794                         }
 795                         n++;
 796                     }
 797                 }
 798             }
 799         }
 800         nzone /= 2;
 801     }
 802 }
 803
 804 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 805 {
 806     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 807     int  *index,*cgindex;
 808     gmx_domdec_comm_t *comm;
 809     gmx_domdec_comm_dim_t *cd;
 810     gmx_domdec_ind_t *ind;
 811     real *buf,*rbuf;
 812
 813     comm = dd->comm;
 814
 815     cgindex = dd->cgindex;
 816
 817     buf = &comm->vbuf.v[0][0];
 818
 819     nzone = 1;
 820     nat_tot = dd->nat_home;
 821     for(d=0; d<dd->ndim; d++)
 822     {
 823         cd = &comm->cd[d];
 824         for(p=0; p<cd->np; p++)
 825         {
 826             ind = &cd->ind[p];
 827             index = ind->index;
 828             n = 0;
 829             for(i=0; i<ind->nsend[nzone]; i++)
 830             {
 831                 at0 = cgindex[index[i]];
 832                 at1 = cgindex[index[i]+1];
 833                 for(j=at0; j<at1; j++)
 834                 {
 835                     buf[n] = v[j];
 836                     n++;
 837                 }
 838             }
 839
 840             if (cd->bInPlace)
 841             {
 842                 rbuf = v + nat_tot;
 843             }
 844             else
 845             {
 846                 rbuf = &comm->vbuf2.v[0][0];
 847             }
 848             /* Send and receive the coordinates */
 849             dd_sendrecv_real(dd, d, dddirBackward,
 850                              buf,  ind->nsend[nzone+1],
 851                              rbuf, ind->nrecv[nzone+1]);
 852             if (!cd->bInPlace)
 853             {
 854                 j = 0;
 855                 for(zone=0; zone<nzone; zone++)
 856                 {
 857                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 858                     {
 859                         v[i] = rbuf[j];
 860                         j++;
 861                     }
 862                 }
 863             }
 864             nat_tot += ind->nrecv[nzone+1];
 865         }
 866         nzone += nzone;
 867     }
 868 }
 869
 870 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 871 {
 872     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 873     int  *index,*cgindex;
 874     gmx_domdec_comm_t *comm;
 875     gmx_domdec_comm_dim_t *cd;
 876     gmx_domdec_ind_t *ind;
 877     real *buf,*sbuf;
 878
 879     comm = dd->comm;
 880
 881     cgindex = dd->cgindex;
 882
 883     buf = &comm->vbuf.v[0][0];
 884
 885     n = 0;
 886     nzone = comm->zones.n/2;
 887     nat_tot = dd->nat_tot;
 888     for(d=dd->ndim-1; d>=0; d--)
 889     {
 890         cd = &comm->cd[d];
 891         for(p=cd->np-1; p>=0; p--) {
 892             ind = &cd->ind[p];
 893             nat_tot -= ind->nrecv[nzone+1];
 894             if (cd->bInPlace)
 895             {
 896                 sbuf = v + nat_tot;
 897             }
 898             else
 899             {
 900                 sbuf = &comm->vbuf2.v[0][0];
 901                 j = 0;
 902                 for(zone=0; zone<nzone; zone++)
 903                 {
 904                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 905                     {
 906                         sbuf[j] = v[i];
 907                         j++;
 908                     }
 909                 }
 910             }
 911             /* Communicate the forces */
 912             dd_sendrecv_real(dd, d, dddirForward,
 913                              sbuf, ind->nrecv[nzone+1],
 914                              buf,  ind->nsend[nzone+1]);
 915             index = ind->index;
 916             /* Add the received forces */
 917             n = 0;
 918             for(i=0; i<ind->nsend[nzone]; i++)
 919             {
 920                 at0 = cgindex[index[i]];
 921                 at1 = cgindex[index[i]+1];
 922                 for(j=at0; j<at1; j++)
 923                 {
 924                     v[j] += buf[n];
 925                     n++;
 926                 }
 927             }
 928         }
 929         nzone /= 2;
 930     }
 931 }
 932
 933 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 934 {
 935     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 936             d,i,j,
 937             zone->min0,zone->max1,
 938             zone->mch0,zone->mch0,
 939             zone->p1_0,zone->p1_1);
 940 }
 941
 942 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 943                                int ddimind,int direction,
 944                                gmx_ddzone_t *buf_s,int n_s,
 945                                gmx_ddzone_t *buf_r,int n_r)
 946 {
 947     rvec vbuf_s[5*2],vbuf_r[5*2];
 948     int i;
 949
 950     for(i=0; i<n_s; i++)
 951     {
 952         vbuf_s[i*2  ][0] = buf_s[i].min0;
 953         vbuf_s[i*2  ][1] = buf_s[i].max1;
 954         vbuf_s[i*2  ][2] = buf_s[i].mch0;
 955         vbuf_s[i*2+1][0] = buf_s[i].mch1;
 956         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
 957         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
 958     }
 959
 960     dd_sendrecv_rvec(dd, ddimind, direction,
 961                      vbuf_s, n_s*2,
 962                      vbuf_r, n_r*2);
 963
 964     for(i=0; i<n_r; i++)
 965     {
 966         buf_r[i].min0 = vbuf_r[i*2  ][0];
 967         buf_r[i].max1 = vbuf_r[i*2  ][1];
 968         buf_r[i].mch0 = vbuf_r[i*2  ][2];
 969         buf_r[i].mch1 = vbuf_r[i*2+1][0];
 970         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
 971         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
 972     }
 973 }
 974
 975 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 976                           rvec cell_ns_x0,rvec cell_ns_x1)
 977 {
 978     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 979     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
 980     rvec extr_s[2],extr_r[2];
 981     rvec dh;
 982     real dist_d,c=0,det;
 983     gmx_domdec_comm_t *comm;
 984     bool bPBC,bUse;
 985
 986     comm = dd->comm;
 987
 988     for(d=1; d<dd->ndim; d++)
 989     {
 990         dim = dd->dim[d];
 991         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 992         zp->min0 = cell_ns_x0[dim];
 993         zp->max1 = cell_ns_x1[dim];
 994         zp->mch0 = cell_ns_x0[dim];
 995         zp->mch1 = cell_ns_x1[dim];
 996         zp->p1_0 = cell_ns_x0[dim];
 997         zp->p1_1 = cell_ns_x1[dim];
 998     }
 999
1000     for(d=dd->ndim-2; d>=0; d--)
1001     {
1002         dim  = dd->dim[d];
1003         bPBC = (dim < ddbox->npbcdim);
1004
1005         /* Use an rvec to store two reals */
1006         extr_s[d][0] = comm->cell_f0[d+1];
1007         extr_s[d][1] = comm->cell_f1[d+1];
1008         extr_s[d][2] = 0;
1009
1010         pos = 0;
1011         /* Store the extremes in the backward sending buffer,
1012          * so the get updated separately from the forward communication.
1013          */
1014         for(d1=d; d1<dd->ndim-1; d1++)
1015         {
1016             /* We invert the order to be able to use the same loop for buf_e */
1017             buf_s[pos].min0 = extr_s[d1][1];
1018             buf_s[pos].max1 = extr_s[d1][0];
1019             buf_s[pos].mch0 = 0;
1020             buf_s[pos].mch1 = 0;
1021             /* Store the cell corner of the dimension we communicate along */
1022             buf_s[pos].p1_0 = comm->cell_x0[dim];
1023             buf_s[pos].p1_1 = 0;
1024             pos++;
1025         }
1026
1027         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1028         pos++;
1029
1030         if (dd->ndim == 3 && d == 0)
1031         {
1032             buf_s[pos] = comm->zone_d2[0][1];
1033             pos++;
1034             buf_s[pos] = comm->zone_d1[0];
1035             pos++;
1036         }
1037
1038         /* We only need to communicate the extremes
1039          * in the forward direction
1040          */
1041         npulse = comm->cd[d].np;
1042         if (bPBC)
1043         {
1044             /* Take the minimum to avoid double communication */
1045             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1046         }
1047         else
1048         {
1049             /* Without PBC we should really not communicate over
1050              * the boundaries, but implementing that complicates
1051              * the communication setup and therefore we simply
1052              * do all communication, but ignore some data.
1053              */
1054             npulse_min = npulse;
1055         }
1056         for(p=0; p<npulse_min; p++)
1057         {
1058             /* Communicate the extremes forward */
1059             bUse = (bPBC || dd->ci[dim] > 0);
1060
1061             dd_sendrecv_rvec(dd, d, dddirForward,
1062                              extr_s+d, dd->ndim-d-1,
1063                              extr_r+d, dd->ndim-d-1);
1064
1065             if (bUse)
1066             {
1067                 for(d1=d; d1<dd->ndim-1; d1++)
1068                 {
1069                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1070                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1071                 }
1072             }
1073         }
1074
1075         buf_size = pos;
1076         for(p=0; p<npulse; p++)
1077         {
1078             /* Communicate all the zone information backward */
1079             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1080
1081             dd_sendrecv_ddzone(dd, d, dddirBackward,
1082                                buf_s, buf_size,
1083                                buf_r, buf_size);
1084
1085             clear_rvec(dh);
1086             if (p > 0)
1087             {
1088                 for(d1=d+1; d1<dd->ndim; d1++)
1089                 {
1090                     /* Determine the decrease of maximum required
1091                      * communication height along d1 due to the distance along d,
1092                      * this avoids a lot of useless atom communication.
1093                      */
1094                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1095
1096                     if (ddbox->tric_dir[dim])
1097                     {
1098                         /* c is the off-diagonal coupling between the cell planes
1099                          * along directions d and d1.
1100                          */
1101                         c = ddbox->v[dim][dd->dim[d1]][dim];
1102                     }
1103                     else
1104                     {
1105                         c = 0;
1106                     }
1107                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1108                     if (det > 0)
1109                     {
1110                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1111                     }
1112                     else
1113                     {
1114                         /* A negative value signals out of range */
1115                         dh[d1] = -1;
1116                     }
1117                 }
1118             }
1119
1120             /* Accumulate the extremes over all pulses */
1121             for(i=0; i<buf_size; i++)
1122             {
1123                 if (p == 0)
1124                 {
1125                     buf_e[i] = buf_r[i];
1126                 }
1127                 else
1128                 {
1129                     if (bUse)
1130                     {
1131                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1132                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1133                     }
1134
1135                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1136                     {
1137                         d1 = 1;
1138                     }
1139                     else
1140                     {
1141                         d1 = d + 1;
1142                     }
1143                     if (bUse && dh[d1] >= 0)
1144                     {
1145                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1146                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1147                     }
1148                 }
1149                 /* Copy the received buffer to the send buffer,
1150                  * to pass the data through with the next pulse.
1151                  */
1152                 buf_s[i] = buf_r[i];
1153             }
1154             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1155                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1156             {
1157                 /* Store the extremes */
1158                 pos = 0;
1159
1160                 for(d1=d; d1<dd->ndim-1; d1++)
1161                 {
1162                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1163                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1164                     pos++;
1165                 }
1166
1167                 if (d == 1 || (d == 0 && dd->ndim == 3))
1168                 {
1169                     for(i=d; i<2; i++)
1170                     {
1171                         comm->zone_d2[1-d][i] = buf_e[pos];
1172                         pos++;
1173                     }
1174                 }
1175                 if (d == 0)
1176                 {
1177                     comm->zone_d1[1] = buf_e[pos];
1178                     pos++;
1179                 }
1180             }
1181         }
1182     }
1183
1184     if (dd->ndim >= 2)
1185     {
1186         dim = dd->dim[1];
1187         for(i=0; i<2; i++)
1188         {
1189             if (debug)
1190             {
1191                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1192             }
1193             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1194             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1195         }
1196     }
1197     if (dd->ndim >= 3)
1198     {
1199         dim = dd->dim[2];
1200         for(i=0; i<2; i++)
1201         {
1202             for(j=0; j<2; j++)
1203             {
1204                 if (debug)
1205                 {
1206                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1207                 }
1208                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1209                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1210             }
1211         }
1212     }
1213     for(d=1; d<dd->ndim; d++)
1214     {
1215         comm->cell_f_max0[d] = extr_s[d-1][0];
1216         comm->cell_f_min1[d] = extr_s[d-1][1];
1217         if (debug)
1218         {
1219             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1220                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1221         }
1222     }
1223 }
1224
1225 static void dd_collect_cg(gmx_domdec_t *dd,
1226                           t_state *state_local)
1227 {
1228     gmx_domdec_master_t *ma=NULL;
1229     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1230     t_block *cgs_gl;
1231
1232     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1233     {
1234         /* The master has the correct distribution */
1235         return;
1236     }
1237
1238     if (state_local->ddp_count == dd->ddp_count)
1239     {
1240         ncg_home = dd->ncg_home;
1241         cg       = dd->index_gl;
1242         nat_home = dd->nat_home;
1243     }
1244     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1245     {
1246         cgs_gl = &dd->comm->cgs_gl;
1247
1248         ncg_home = state_local->ncg_gl;
1249         cg       = state_local->cg_gl;
1250         nat_home = 0;
1251         for(i=0; i<ncg_home; i++)
1252         {
1253             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1254         }
1255     }
1256     else
1257     {
1258         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1259     }
1260
1261     buf2[0] = dd->ncg_home;
1262     buf2[1] = dd->nat_home;
1263     if (DDMASTER(dd))
1264     {
1265         ma = dd->ma;
1266         ibuf = ma->ibuf;
1267     }
1268     else
1269     {
1270         ibuf = NULL;
1271     }
1272     /* Collect the charge group and atom counts on the master */
1273     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1274
1275     if (DDMASTER(dd))
1276     {
1277         ma->index[0] = 0;
1278         for(i=0; i<dd->nnodes; i++)
1279         {
1280             ma->ncg[i] = ma->ibuf[2*i];
1281             ma->nat[i] = ma->ibuf[2*i+1];
1282             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1283
1284         }
1285         /* Make byte counts and indices */
1286         for(i=0; i<dd->nnodes; i++)
1287         {
1288             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1289             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1290         }
1291         if (debug)
1292         {
1293             fprintf(debug,"Initial charge group distribution: ");
1294             for(i=0; i<dd->nnodes; i++)
1295                 fprintf(debug," %d",ma->ncg[i]);
1296             fprintf(debug,"\n");
1297         }
1298     }
1299
1300     /* Collect the charge group indices on the master */
1301     dd_gatherv(dd,
1302                dd->ncg_home*sizeof(int),dd->index_gl,
1303                DDMASTER(dd) ? ma->ibuf : NULL,
1304                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1305                DDMASTER(dd) ? ma->cg : NULL);
1306
1307     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1308 }
1309
1310 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1311                                     rvec *lv,rvec *v)
1312 {
1313     gmx_domdec_master_t *ma;
1314     int  n,i,c,a,nalloc=0;
1315     rvec *buf=NULL;
1316     t_block *cgs_gl;
1317
1318     ma = dd->ma;
1319
1320     if (!DDMASTER(dd))
1321     {
1322 #ifdef GMX_MPI
1323         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1324                  dd->rank,dd->mpi_comm_all);
1325 #endif
1326     } else {
1327         /* Copy the master coordinates to the global array */
1328         cgs_gl = &dd->comm->cgs_gl;
1329
1330         n = DDMASTERRANK(dd);
1331         a = 0;
1332         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1333         {
1334             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1335             {
1336                 copy_rvec(lv[a++],v[c]);
1337             }
1338         }
1339
1340         for(n=0; n<dd->nnodes; n++)
1341         {
1342             if (n != dd->rank)
1343             {
1344                 if (ma->nat[n] > nalloc)
1345                 {
1346                     nalloc = over_alloc_dd(ma->nat[n]);
1347                     srenew(buf,nalloc);
1348                 }
1349 #ifdef GMX_MPI
1350                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1351                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1352 #endif
1353                 a = 0;
1354                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1355                 {
1356                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1357                     {
1358                         copy_rvec(buf[a++],v[c]);
1359                     }
1360                 }
1361             }
1362         }
1363         sfree(buf);
1364     }
1365 }
1366
1367 static void get_commbuffer_counts(gmx_domdec_t *dd,
1368                                   int **counts,int **disps)
1369 {
1370     gmx_domdec_master_t *ma;
1371     int n;
1372
1373     ma = dd->ma;
1374
1375     /* Make the rvec count and displacment arrays */
1376     *counts  = ma->ibuf;
1377     *disps   = ma->ibuf + dd->nnodes;
1378     for(n=0; n<dd->nnodes; n++)
1379     {
1380         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1381         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1382     }
1383 }
1384
1385 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1386                                    rvec *lv,rvec *v)
1387 {
1388     gmx_domdec_master_t *ma;
1389     int  *rcounts=NULL,*disps=NULL;
1390     int  n,i,c,a;
1391     rvec *buf=NULL;
1392     t_block *cgs_gl;
1393
1394     ma = dd->ma;
1395
1396     if (DDMASTER(dd))
1397     {
1398         get_commbuffer_counts(dd,&rcounts,&disps);
1399
1400         buf = ma->vbuf;
1401     }
1402
1403     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1404
1405     if (DDMASTER(dd))
1406     {
1407         cgs_gl = &dd->comm->cgs_gl;
1408
1409         a = 0;
1410         for(n=0; n<dd->nnodes; n++)
1411         {
1412             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1413             {
1414                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1415                 {
1416                     copy_rvec(buf[a++],v[c]);
1417                 }
1418             }
1419         }
1420     }
1421 }
1422
1423 void dd_collect_vec(gmx_domdec_t *dd,
1424                     t_state *state_local,rvec *lv,rvec *v)
1425 {
1426     gmx_domdec_master_t *ma;
1427     int  n,i,c,a,nalloc=0;
1428     rvec *buf=NULL;
1429
1430     dd_collect_cg(dd,state_local);
1431
1432     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1433     {
1434         dd_collect_vec_sendrecv(dd,lv,v);
1435     }
1436     else
1437     {
1438         dd_collect_vec_gatherv(dd,lv,v);
1439     }
1440 }
1441
1442
1443 void dd_collect_state(gmx_domdec_t *dd,
1444                       t_state *state_local,t_state *state)
1445 {
1446     int est,i,j,nh;
1447
1448     nh = state->nhchainlength;
1449
1450     if (DDMASTER(dd))
1451     {
1452         state->lambda = state_local->lambda;
1453         state->veta = state_local->veta;
1454         state->vol0 = state_local->vol0;
1455         copy_mat(state_local->box,state->box);
1456         copy_mat(state_local->boxv,state->boxv);
1457         copy_mat(state_local->svir_prev,state->svir_prev);
1458         copy_mat(state_local->fvir_prev,state->fvir_prev);
1459         copy_mat(state_local->pres_prev,state->pres_prev);
1460
1461
1462         for(i=0; i<state_local->ngtc; i++)
1463         {
1464             for(j=0; j<nh; j++) {
1465                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1466                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1467             }
1468             state->therm_integral[i] = state_local->therm_integral[i];
1469         }
1470         for(i=0; i<state_local->nnhpres; i++)
1471         {
1472             for(j=0; j<nh; j++) {
1473                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1474                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1475             }
1476         }
1477     }
1478     for(est=0; est<estNR; est++)
1479     {
1480         if (EST_DISTR(est) && state_local->flags & (1<<est))
1481         {
1482             switch (est) {
1483             case estX:
1484                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1485                 break;
1486             case estV:
1487                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1488                 break;
1489             case estSDX:
1490                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1491                 break;
1492             case estCGP:
1493                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1494                 break;
1495             case estLD_RNG:
1496                 if (state->nrngi == 1)
1497                 {
1498                     if (DDMASTER(dd))
1499                     {
1500                         for(i=0; i<state_local->nrng; i++)
1501                         {
1502                             state->ld_rng[i] = state_local->ld_rng[i];
1503                         }
1504                     }
1505                 }
1506                 else
1507                 {
1508                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1509                               state_local->ld_rng,state->ld_rng);
1510                 }
1511                 break;
1512             case estLD_RNGI:
1513                 if (state->nrngi == 1)
1514                 {
1515                    if (DDMASTER(dd))
1516                     {
1517                         state->ld_rngi[0] = state_local->ld_rngi[0];
1518                     }
1519                 }
1520                 else
1521                 {
1522                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1523                               state_local->ld_rngi,state->ld_rngi);
1524                 }
1525                 break;
1526             case estDISRE_INITF:
1527             case estDISRE_RM3TAV:
1528             case estORIRE_INITF:
1529             case estORIRE_DTAV:
1530                 break;
1531             default:
1532                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1533             }
1534         }
1535     }
1536 }
1537
1538 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1539 {
1540     if (debug)
1541     {
1542         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1543     }
1544     fr->cg_nalloc = over_alloc_dd(nalloc);
1545     srenew(fr->cg_cm,fr->cg_nalloc);
1546     srenew(fr->cginfo,fr->cg_nalloc);
1547 }
1548
1549 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1550 {
1551     int est;
1552
1553     if (debug)
1554     {
1555         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1556     }
1557
1558     state->nalloc = over_alloc_dd(nalloc);
1559
1560     for(est=0; est<estNR; est++)
1561     {
1562         if (EST_DISTR(est) && state->flags & (1<<est))
1563         {
1564             switch(est) {
1565             case estX:
1566                 srenew(state->x,state->nalloc);
1567                 break;
1568             case estV:
1569                 srenew(state->v,state->nalloc);
1570                 break;
1571             case estSDX:
1572                 srenew(state->sd_X,state->nalloc);
1573                 break;
1574             case estCGP:
1575                 srenew(state->cg_p,state->nalloc);
1576                 break;
1577             case estLD_RNG:
1578             case estLD_RNGI:
1579             case estDISRE_INITF:
1580             case estDISRE_RM3TAV:
1581             case estORIRE_INITF:
1582             case estORIRE_DTAV:
1583                 /* No reallocation required */
1584                 break;
1585             default:
1586                 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1587             }
1588         }
1589     }
1590
1591     if (f != NULL)
1592     {
1593         srenew(*f,state->nalloc);
1594     }
1595 }
1596
1597 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1598                                        rvec *v,rvec *lv)
1599 {
1600     gmx_domdec_master_t *ma;
1601     int  n,i,c,a,nalloc=0;
1602     rvec *buf=NULL;
1603
1604     if (DDMASTER(dd))
1605     {
1606         ma  = dd->ma;
1607
1608         for(n=0; n<dd->nnodes; n++)
1609         {
1610             if (n != dd->rank)
1611             {
1612                 if (ma->nat[n] > nalloc)
1613                 {
1614                     nalloc = over_alloc_dd(ma->nat[n]);
1615                     srenew(buf,nalloc);
1616                 }
1617                 /* Use lv as a temporary buffer */
1618                 a = 0;
1619                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1620                 {
1621                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1622                     {
1623                         copy_rvec(v[c],buf[a++]);
1624                     }
1625                 }
1626                 if (a != ma->nat[n])
1627                 {
1628                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1629                               a,ma->nat[n]);
1630                 }
1631
1632 #ifdef GMX_MPI
1633                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1634                          DDRANK(dd,n),n,dd->mpi_comm_all);
1635 #endif
1636             }
1637         }
1638         sfree(buf);
1639         n = DDMASTERRANK(dd);
1640         a = 0;
1641         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1642         {
1643             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1644             {
1645                 copy_rvec(v[c],lv[a++]);
1646             }
1647         }
1648     }
1649     else
1650     {
1651 #ifdef GMX_MPI
1652         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1653                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1654 #endif
1655     }
1656 }
1657
1658 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1659                                        rvec *v,rvec *lv)
1660 {
1661     gmx_domdec_master_t *ma;
1662     int  *scounts=NULL,*disps=NULL;
1663     int  n,i,c,a,nalloc=0;
1664     rvec *buf=NULL;
1665
1666     if (DDMASTER(dd))
1667     {
1668         ma  = dd->ma;
1669
1670         get_commbuffer_counts(dd,&scounts,&disps);
1671
1672         buf = ma->vbuf;
1673         a = 0;
1674         for(n=0; n<dd->nnodes; n++)
1675         {
1676             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1677             {
1678                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1679                 {
1680                     copy_rvec(v[c],buf[a++]);
1681                 }
1682             }
1683         }
1684     }
1685
1686     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1687 }
1688
1689 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1690 {
1691     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1692     {
1693         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1694     }
1695     else
1696     {
1697         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1698     }
1699 }
1700
1701 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1702                                 t_state *state,t_state *state_local,
1703                                 rvec **f)
1704 {
1705     int  i,j,ngtch,ngtcp,nh;
1706
1707     nh = state->nhchainlength;
1708
1709     if (DDMASTER(dd))
1710     {
1711         state_local->lambda = state->lambda;
1712         state_local->veta   = state->veta;
1713         state_local->vol0   = state->vol0;
1714         copy_mat(state->box,state_local->box);
1715         copy_mat(state->box_rel,state_local->box_rel);
1716         copy_mat(state->boxv,state_local->boxv);
1717         copy_mat(state->svir_prev,state_local->svir_prev);
1718         copy_mat(state->fvir_prev,state_local->fvir_prev);
1719         for(i=0; i<state_local->ngtc; i++)
1720         {
1721             for(j=0; j<nh; j++) {
1722                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1723                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1724             }
1725             state_local->therm_integral[i] = state->therm_integral[i];
1726         }
1727         for(i=0; i<state_local->nnhpres; i++)
1728         {
1729             for(j=0; j<nh; j++) {
1730                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1731                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1732             }
1733         }
1734     }
1735     dd_bcast(dd,sizeof(real),&state_local->lambda);
1736     dd_bcast(dd,sizeof(real),&state_local->veta);
1737     dd_bcast(dd,sizeof(real),&state_local->vol0);
1738     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1739     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1740     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1741     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1742     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1743     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1744     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1745     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1746     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1747     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1748
1749     if (dd->nat_home > state_local->nalloc)
1750     {
1751         dd_realloc_state(state_local,f,dd->nat_home);
1752     }
1753     for(i=0; i<estNR; i++)
1754     {
1755         if (EST_DISTR(i) && state_local->flags & (1<<i))
1756         {
1757             switch (i) {
1758             case estX:
1759                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1760                 break;
1761             case estV:
1762                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1763                 break;
1764             case estSDX:
1765                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1766                 break;
1767             case estCGP:
1768                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1769                 break;
1770             case estLD_RNG:
1771                 if (state->nrngi == 1)
1772                 {
1773                     dd_bcastc(dd,
1774                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1775                               state->ld_rng,state_local->ld_rng);
1776                 }
1777                 else
1778                 {
1779                     dd_scatter(dd,
1780                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1781                                state->ld_rng,state_local->ld_rng);
1782                 }
1783                 break;
1784             case estLD_RNGI:
1785                 if (state->nrngi == 1)
1786                 {
1787                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1788                               state->ld_rngi,state_local->ld_rngi);
1789                 }
1790                 else
1791                 {
1792                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1793                                state->ld_rngi,state_local->ld_rngi);
1794                 }
1795                 break;
1796             case estDISRE_INITF:
1797             case estDISRE_RM3TAV:
1798             case estORIRE_INITF:
1799             case estORIRE_DTAV:
1800                 /* Not implemented yet */
1801                 break;
1802             default:
1803                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1804             }
1805         }
1806     }
1807 }
1808
1809 static char dim2char(int dim)
1810 {
1811     char c='?';
1812
1813     switch (dim)
1814     {
1815     case XX: c = 'X'; break;
1816     case YY: c = 'Y'; break;
1817     case ZZ: c = 'Z'; break;
1818     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1819     }
1820
1821     return c;
1822 }
1823
1824 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1825                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1826 {
1827     rvec grid_s[2],*grid_r=NULL,cx,r;
1828     char fname[STRLEN],format[STRLEN],buf[22];
1829     FILE *out;
1830     int  a,i,d,z,y,x;
1831     matrix tric;
1832     real vol;
1833
1834     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1835     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1836
1837     if (DDMASTER(dd))
1838     {
1839         snew(grid_r,2*dd->nnodes);
1840     }
1841
1842     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1843
1844     if (DDMASTER(dd))
1845     {
1846         for(d=0; d<DIM; d++)
1847         {
1848             for(i=0; i<DIM; i++)
1849             {
1850                 if (d == i)
1851                 {
1852                     tric[d][i] = 1;
1853                 }
1854                 else
1855                 {
1856                     if (dd->nc[d] > 1 && d < ddbox->npbcdim)
1857                     {
1858                         tric[d][i] = box[i][d]/box[i][i];
1859                     }
1860                     else
1861                     {
1862                         tric[d][i] = 0;
1863                     }
1864                 }
1865             }
1866         }
1867         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1868         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1869         out = gmx_fio_fopen(fname,"w");
1870         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1871         a = 1;
1872         for(i=0; i<dd->nnodes; i++)
1873         {
1874             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1875             for(d=0; d<DIM; d++)
1876             {
1877                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1878             }
1879             for(z=0; z<2; z++)
1880             {
1881                 for(y=0; y<2; y++)
1882                 {
1883                     for(x=0; x<2; x++)
1884                     {
1885                         cx[XX] = grid_r[i*2+x][XX];
1886                         cx[YY] = grid_r[i*2+y][YY];
1887                         cx[ZZ] = grid_r[i*2+z][ZZ];
1888                         mvmul(tric,cx,r);
1889                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1890                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1891                     }
1892                 }
1893             }
1894             for(d=0; d<DIM; d++)
1895             {
1896                 for(x=0; x<4; x++)
1897                 {
1898                     switch(d)
1899                     {
1900                     case 0: y = 1 + i*8 + 2*x; break;
1901                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1902                     case 2: y = 1 + i*8 + x; break;
1903                     }
1904                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1905                 }
1906             }
1907         }
1908         gmx_fio_fclose(out);
1909         sfree(grid_r);
1910     }
1911 }
1912
1913 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1914                   gmx_mtop_t *mtop,t_commrec *cr,
1915                   int natoms,rvec x[],matrix box)
1916 {
1917     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1918     FILE *out;
1919     int  i,ii,resnr,c;
1920     char *atomname,*resname;
1921     real b;
1922     gmx_domdec_t *dd;
1923
1924     dd = cr->dd;
1925     if (natoms == -1)
1926     {
1927         natoms = dd->comm->nat[ddnatVSITE];
1928     }
1929
1930     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1931
1932     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1933     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1934
1935     out = gmx_fio_fopen(fname,"w");
1936
1937     fprintf(out,"TITLE     %s\n",title);
1938     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1939     for(i=0; i<natoms; i++)
1940     {
1941         ii = dd->gatindex[i];
1942         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1943         if (i < dd->comm->nat[ddnatZONE])
1944         {
1945             c = 0;
1946             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1947             {
1948                 c++;
1949             }
1950             b = c;
1951         }
1952         else if (i < dd->comm->nat[ddnatVSITE])
1953         {
1954             b = dd->comm->zones.n;
1955         }
1956         else
1957         {
1958             b = dd->comm->zones.n + 1;
1959         }
1960         fprintf(out,strlen(atomname)<4 ? format : format4,
1961                 "ATOM",(ii+1)%100000,
1962                 atomname,resname,' ',resnr%10000,' ',
1963                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1964     }
1965     fprintf(out,"TER\n");
1966
1967     gmx_fio_fclose(out);
1968 }
1969
1970 real dd_cutoff_mbody(gmx_domdec_t *dd)
1971 {
1972     gmx_domdec_comm_t *comm;
1973     int  di;
1974     real r;
1975
1976     comm = dd->comm;
1977
1978     r = -1;
1979     if (comm->bInterCGBondeds)
1980     {
1981         if (comm->cutoff_mbody > 0)
1982         {
1983             r = comm->cutoff_mbody;
1984         }
1985         else
1986         {
1987             /* cutoff_mbody=0 means we do not have DLB */
1988             r = comm->cellsize_min[dd->dim[0]];
1989             for(di=1; di<dd->ndim; di++)
1990             {
1991                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1992             }
1993             if (comm->bBondComm)
1994             {
1995                 r = max(r,comm->cutoff_mbody);
1996             }
1997             else
1998             {
1999                 r = min(r,comm->cutoff);
2000             }
2001         }
2002     }
2003
2004     return r;
2005 }
2006
2007 real dd_cutoff_twobody(gmx_domdec_t *dd)
2008 {
2009     real r_mb;
2010
2011     r_mb = dd_cutoff_mbody(dd);
2012
2013     return max(dd->comm->cutoff,r_mb);
2014 }
2015
2016
2017 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2018 {
2019     int nc,ntot;
2020
2021     nc   = dd->nc[dd->comm->cartpmedim];
2022     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2023     copy_ivec(coord,coord_pme);
2024     coord_pme[dd->comm->cartpmedim] =
2025         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2026 }
2027
2028 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2029 {
2030     /* Here we assign a PME node to communicate with this DD node
2031      * by assuming that the major index of both is x.
2032      * We add cr->npmenodes/2 to obtain an even distribution.
2033      */
2034     return (ddindex*npme + npme/2)/ndd;
2035 }
2036
2037 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2038 {
2039     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2040 }
2041
2042 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2043 {
2044     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2045 }
2046
2047 static int *dd_pmenodes(t_commrec *cr)
2048 {
2049     int *pmenodes;
2050     int n,i,p0,p1;
2051
2052     snew(pmenodes,cr->npmenodes);
2053     n = 0;
2054     for(i=0; i<cr->dd->nnodes; i++) {
2055         p0 = cr_ddindex2pmeindex(cr,i);
2056         p1 = cr_ddindex2pmeindex(cr,i+1);
2057         if (i+1 == cr->dd->nnodes || p1 > p0) {
2058             if (debug)
2059                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2060             pmenodes[n] = i + 1 + n;
2061             n++;
2062         }
2063     }
2064
2065     return pmenodes;
2066 }
2067
2068 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2069 {
2070     gmx_domdec_t *dd;
2071     ivec coords,coords_pme,nc;
2072     int  slab;
2073
2074     dd = cr->dd;
2075     /*
2076       if (dd->comm->bCartesian) {
2077       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2078       dd_coords2pmecoords(dd,coords,coords_pme);
2079       copy_ivec(dd->ntot,nc);
2080       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2081       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2082
2083       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2084       } else {
2085       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2086       }
2087     */
2088     coords[XX] = x;
2089     coords[YY] = y;
2090     coords[ZZ] = z;
2091     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2092
2093     return slab;
2094 }
2095
2096 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2097 {
2098     gmx_domdec_comm_t *comm;
2099     ivec coords;
2100     int  ddindex,nodeid=-1;
2101
2102     comm = cr->dd->comm;
2103
2104     coords[XX] = x;
2105     coords[YY] = y;
2106     coords[ZZ] = z;
2107     if (comm->bCartesianPP_PME)
2108     {
2109 #ifdef GMX_MPI
2110         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2111 #endif
2112     }
2113     else
2114     {
2115         ddindex = dd_index(cr->dd->nc,coords);
2116         if (comm->bCartesianPP)
2117         {
2118             nodeid = comm->ddindex2simnodeid[ddindex];
2119         }
2120         else
2121         {
2122             if (comm->pmenodes)
2123             {
2124                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2125             }
2126             else
2127             {
2128                 nodeid = ddindex;
2129             }
2130         }
2131     }
2132
2133     return nodeid;
2134 }
2135
2136 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2137 {
2138     gmx_domdec_t *dd;
2139     gmx_domdec_comm_t *comm;
2140     ivec coord,coord_pme;
2141     int  i;
2142     int  pmenode=-1;
2143
2144     dd = cr->dd;
2145     comm = dd->comm;
2146
2147     /* This assumes a uniform x domain decomposition grid cell size */
2148     if (comm->bCartesianPP_PME)
2149     {
2150 #ifdef GMX_MPI
2151         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2152         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2153         {
2154             /* This is a PP node */
2155             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2156             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2157         }
2158 #endif
2159     }
2160     else if (comm->bCartesianPP)
2161     {
2162         if (sim_nodeid < dd->nnodes)
2163         {
2164             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2165         }
2166     }
2167     else
2168     {
2169         /* This assumes DD cells with identical x coordinates
2170          * are numbered sequentially.
2171          */
2172         if (dd->comm->pmenodes == NULL)
2173         {
2174             if (sim_nodeid < dd->nnodes)
2175             {
2176                 /* The DD index equals the nodeid */
2177                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2178             }
2179         }
2180         else
2181         {
2182             i = 0;
2183             while (sim_nodeid > dd->comm->pmenodes[i])
2184             {
2185                 i++;
2186             }
2187             if (sim_nodeid < dd->comm->pmenodes[i])
2188             {
2189                 pmenode = dd->comm->pmenodes[i];
2190             }
2191         }
2192     }
2193
2194     return pmenode;
2195 }
2196
2197 bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2198 {
2199     bool bPMEOnlyNode;
2200
2201     if (DOMAINDECOMP(cr))
2202     {
2203         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2204     }
2205     else
2206     {
2207         bPMEOnlyNode = FALSE;
2208     }
2209
2210     return bPMEOnlyNode;
2211 }
2212
2213 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2214                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2215 {
2216     gmx_domdec_t *dd;
2217     int x,y,z;
2218     ivec coord,coord_pme;
2219
2220     dd = cr->dd;
2221
2222     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2223
2224     *nmy_ddnodes = 0;
2225     for(x=0; x<dd->nc[XX]; x++)
2226     {
2227         for(y=0; y<dd->nc[YY]; y++)
2228         {
2229             for(z=0; z<dd->nc[ZZ]; z++)
2230             {
2231                 if (dd->comm->bCartesianPP_PME)
2232                 {
2233                     coord[XX] = x;
2234                     coord[YY] = y;
2235                     coord[ZZ] = z;
2236                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2237                     if (dd->ci[XX] == coord_pme[XX] &&
2238                         dd->ci[YY] == coord_pme[YY] &&
2239                         dd->ci[ZZ] == coord_pme[ZZ])
2240                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2241                 }
2242                 else
2243                 {
2244                     /* The slab corresponds to the nodeid in the PME group */
2245                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2246                     {
2247                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2248                     }
2249                 }
2250             }
2251         }
2252     }
2253
2254     /* The last PP-only node is the peer node */
2255     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2256
2257     if (debug)
2258     {
2259         fprintf(debug,"Receive coordinates from PP nodes:");
2260         for(x=0; x<*nmy_ddnodes; x++)
2261         {
2262             fprintf(debug," %d",(*my_ddnodes)[x]);
2263         }
2264         fprintf(debug,"\n");
2265     }
2266 }
2267
2268 static bool receive_vir_ener(t_commrec *cr)
2269 {
2270     gmx_domdec_comm_t *comm;
2271     int  pmenode,coords[DIM],rank;
2272     bool bReceive;
2273
2274     bReceive = TRUE;
2275     if (cr->npmenodes < cr->dd->nnodes)
2276     {
2277         comm = cr->dd->comm;
2278         if (comm->bCartesianPP_PME)
2279         {
2280             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2281 #ifdef GMX_MPI
2282             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2283             coords[comm->cartpmedim]++;
2284             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2285             {
2286                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2287                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2288                 {
2289                     /* This is not the last PP node for pmenode */
2290                     bReceive = FALSE;
2291                 }
2292             }
2293 #endif
2294         }
2295         else
2296         {
2297             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2298             if (cr->sim_nodeid+1 < cr->nnodes &&
2299                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2300             {
2301                 /* This is not the last PP node for pmenode */
2302                 bReceive = FALSE;
2303             }
2304         }
2305     }
2306
2307     return bReceive;
2308 }
2309
2310 static void set_zones_ncg_home(gmx_domdec_t *dd)
2311 {
2312     gmx_domdec_zones_t *zones;
2313     int i;
2314
2315     zones = &dd->comm->zones;
2316
2317     zones->cg_range[0] = 0;
2318     for(i=1; i<zones->n+1; i++)
2319     {
2320         zones->cg_range[i] = dd->ncg_home;
2321     }
2322 }
2323
2324 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2325 {
2326     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2327
2328     ind = state->cg_gl;
2329     dd_cg_gl = dd->index_gl;
2330     cgindex  = dd->cgindex;
2331     nat = 0;
2332     cgindex[0] = nat;
2333     for(i=0; i<state->ncg_gl; i++)
2334     {
2335         cgindex[i] = nat;
2336         cg_gl = ind[i];
2337         dd_cg_gl[i] = cg_gl;
2338         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2339     }
2340     cgindex[i] = nat;
2341
2342     dd->ncg_home = state->ncg_gl;
2343     dd->nat_home = nat;
2344
2345     set_zones_ncg_home(dd);
2346 }
2347
2348 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2349 {
2350     while (cg >= cginfo_mb->cg_end)
2351     {
2352         cginfo_mb++;
2353     }
2354
2355     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2356 }
2357
2358 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2359                           t_forcerec *fr,char *bLocalCG)
2360 {
2361     cginfo_mb_t *cginfo_mb;
2362     int *cginfo;
2363     int cg;
2364
2365     if (fr != NULL)
2366     {
2367         cginfo_mb = fr->cginfo_mb;
2368         cginfo    = fr->cginfo;
2369
2370         for(cg=cg0; cg<cg1; cg++)
2371         {
2372             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2373         }
2374     }
2375
2376     if (bLocalCG != NULL)
2377     {
2378         for(cg=cg0; cg<cg1; cg++)
2379         {
2380             bLocalCG[index_gl[cg]] = TRUE;
2381         }
2382     }
2383 }
2384
2385 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2386 {
2387     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2388     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2389     gmx_ga2la_t *ga2la;
2390     char *bLocalCG;
2391
2392     bLocalCG = dd->comm->bLocalCG;
2393
2394     if (dd->nat_tot > dd->gatindex_nalloc)
2395     {
2396         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2397         srenew(dd->gatindex,dd->gatindex_nalloc);
2398     }
2399
2400     nzone      = dd->comm->zones.n;
2401     zone2cg    = dd->comm->zones.cg_range;
2402     zone_ncg1  = dd->comm->zone_ncg1;
2403     index_gl   = dd->index_gl;
2404     gatindex   = dd->gatindex;
2405
2406     if (zone2cg[1] != dd->ncg_home)
2407     {
2408         gmx_incons("dd->ncg_zone is not up to date");
2409     }
2410
2411     /* Make the local to global and global to local atom index */
2412     a = dd->cgindex[cg_start];
2413     for(zone=0; zone<nzone; zone++)
2414     {
2415         if (zone == 0)
2416         {
2417             cg0 = cg_start;
2418         }
2419         else
2420         {
2421             cg0 = zone2cg[zone];
2422         }
2423         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2424         {
2425             zone1 = zone;
2426             if (cg - cg0 >= zone_ncg1[zone])
2427             {
2428                 /* Signal that this cg is from more than one zone away */
2429                 zone1 += nzone;
2430             }
2431             cg_gl = index_gl[cg];
2432             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2433             {
2434                 gatindex[a] = a_gl;
2435                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2436                 a++;
2437             }
2438         }
2439     }
2440 }
2441
2442 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2443                           const char *where)
2444 {
2445     int ncg,i,ngl,nerr;
2446
2447     nerr = 0;
2448     if (bLocalCG == NULL)
2449     {
2450         return nerr;
2451     }
2452     for(i=0; i<dd->ncg_tot; i++)
2453     {
2454         if (!bLocalCG[dd->index_gl[i]])
2455         {
2456             fprintf(stderr,
2457                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2458             nerr++;
2459         }
2460     }
2461     ngl = 0;
2462     for(i=0; i<ncg_sys; i++)
2463     {
2464         if (bLocalCG[i])
2465         {
2466             ngl++;
2467         }
2468     }
2469     if (ngl != dd->ncg_tot)
2470     {
2471         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2472         nerr++;
2473     }
2474
2475     return nerr;
2476 }
2477
2478 static void check_index_consistency(gmx_domdec_t *dd,
2479                                     int natoms_sys,int ncg_sys,
2480                                     const char *where)
2481 {
2482     int  nerr,ngl,i,a,cell;
2483     int  *have;
2484
2485     nerr = 0;
2486
2487     if (dd->comm->DD_debug > 1)
2488     {
2489         snew(have,natoms_sys);
2490         for(a=0; a<dd->nat_tot; a++)
2491         {
2492             if (have[dd->gatindex[a]] > 0)
2493             {
2494                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2495             }
2496             else
2497             {
2498                 have[dd->gatindex[a]] = a + 1;
2499             }
2500         }
2501         sfree(have);
2502     }
2503
2504     snew(have,dd->nat_tot);
2505
2506     ngl  = 0;
2507     for(i=0; i<natoms_sys; i++)
2508     {
2509         if (ga2la_get(dd->ga2la,i,&a,&cell))
2510         {
2511             if (a >= dd->nat_tot)
2512             {
2513                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2514                 nerr++;
2515             }
2516             else
2517             {
2518                 have[a] = 1;
2519                 if (dd->gatindex[a] != i)
2520                 {
2521                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2522                     nerr++;
2523                 }
2524             }
2525             ngl++;
2526         }
2527     }
2528     if (ngl != dd->nat_tot)
2529     {
2530         fprintf(stderr,
2531                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2532                 dd->rank,where,ngl,dd->nat_tot);
2533     }
2534     for(a=0; a<dd->nat_tot; a++)
2535     {
2536         if (have[a] == 0)
2537         {
2538             fprintf(stderr,
2539                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2540                     dd->rank,where,a+1,dd->gatindex[a]+1);
2541         }
2542     }
2543     sfree(have);
2544
2545     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2546
2547     if (nerr > 0) {
2548         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2549                   dd->rank,where,nerr);
2550     }
2551 }
2552
2553 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2554 {
2555     int  i;
2556     char *bLocalCG;
2557
2558     if (a_start == 0)
2559     {
2560         /* Clear the whole list without searching */
2561         ga2la_clear(dd->ga2la);
2562     }
2563     else
2564     {
2565         for(i=a_start; i<dd->nat_tot; i++)
2566         {
2567             ga2la_del(dd->ga2la,dd->gatindex[i]);
2568         }
2569     }
2570
2571     bLocalCG = dd->comm->bLocalCG;
2572     if (bLocalCG)
2573     {
2574         for(i=cg_start; i<dd->ncg_tot; i++)
2575         {
2576             bLocalCG[dd->index_gl[i]] = FALSE;
2577         }
2578     }
2579
2580     dd_clear_local_vsite_indices(dd);
2581
2582     if (dd->constraints)
2583     {
2584         dd_clear_local_constraint_indices(dd);
2585     }
2586 }
2587
2588 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2589 {
2590     real grid_jump_limit;
2591
2592     /* The distance between the boundaries of cells at distance
2593      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2594      * and by the fact that cells should not be shifted by more than
2595      * half their size, such that cg's only shift by one cell
2596      * at redecomposition.
2597      */
2598     grid_jump_limit = comm->cellsize_limit;
2599     if (!comm->bVacDLBNoLimit)
2600     {
2601         grid_jump_limit = max(grid_jump_limit,
2602                               comm->cutoff/comm->cd[dim_ind].np);
2603     }
2604
2605     return grid_jump_limit;
2606 }
2607
2608 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2609 {
2610     gmx_domdec_comm_t *comm;
2611     int  d,dim;
2612     real limit,bfac;
2613
2614     comm = dd->comm;
2615
2616     for(d=1; d<dd->ndim; d++)
2617     {
2618         dim = dd->dim[d];
2619         limit = grid_jump_limit(comm,d);
2620         bfac = ddbox->box_size[dim];
2621         if (ddbox->tric_dir[dim])
2622         {
2623             bfac *= ddbox->skew_fac[dim];
2624         }
2625         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2626             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2627         {
2628             char buf[22];
2629             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2630                       gmx_step_str(step,buf),
2631                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2632         }
2633     }
2634 }
2635
2636 static int dd_load_count(gmx_domdec_comm_t *comm)
2637 {
2638     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2639 }
2640
2641 static float dd_force_load(gmx_domdec_comm_t *comm)
2642 {
2643     float load;
2644
2645     if (comm->eFlop)
2646     {
2647         load = comm->flop;
2648         if (comm->eFlop > 1)
2649         {
2650             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2651         }
2652     }
2653     else
2654     {
2655         load = comm->cycl[ddCyclF];
2656         if (comm->cycl_n[ddCyclF] > 1)
2657         {
2658             /* Subtract the maximum of the last n cycle counts
2659              * to get rid of possible high counts due to other soures,
2660              * for instance system activity, that would otherwise
2661              * affect the dynamic load balancing.
2662              */
2663             load -= comm->cycl_max[ddCyclF];
2664         }
2665     }
2666
2667     return load;
2668 }
2669
2670 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2671 {
2672     gmx_domdec_comm_t *comm;
2673     int i;
2674
2675     comm = dd->comm;
2676
2677     snew(*dim_f,dd->nc[dim]+1);
2678     (*dim_f)[0] = 0;
2679     for(i=1; i<dd->nc[dim]; i++)
2680     {
2681         if (comm->slb_frac[dim])
2682         {
2683             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2684         }
2685         else
2686         {
2687             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2688         }
2689     }
2690     (*dim_f)[dd->nc[dim]] = 1;
2691 }
2692
2693 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2694 {
2695     int  pmeindex,slab,nso,i;
2696     ivec xyz;
2697
2698     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2699     {
2700         ddpme->dim = YY;
2701     }
2702     else
2703     {
2704         ddpme->dim = dimind;
2705     }
2706     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2707
2708     ddpme->nslab = (ddpme->dim == 0 ?
2709                     dd->comm->npmenodes_x :
2710                     dd->comm->npmenodes_y);
2711
2712     if (ddpme->nslab <= 1)
2713     {
2714         return;
2715     }
2716
2717     nso = dd->comm->npmenodes/ddpme->nslab;
2718     /* Determine for each PME slab the PP location range for dimension dim */
2719     snew(ddpme->pp_min,ddpme->nslab);
2720     snew(ddpme->pp_max,ddpme->nslab);
2721     for(slab=0; slab<ddpme->nslab; slab++) {
2722         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2723         ddpme->pp_max[slab] = 0;
2724     }
2725     for(i=0; i<dd->nnodes; i++) {
2726         ddindex2xyz(dd->nc,i,xyz);
2727         /* For y only use our y/z slab.
2728          * This assumes that the PME x grid size matches the DD grid size.
2729          */
2730         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2731             pmeindex = ddindex2pmeindex(dd,i);
2732             if (dimind == 0) {
2733                 slab = pmeindex/nso;
2734             } else {
2735                 slab = pmeindex % ddpme->nslab;
2736             }
2737             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2738             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2739         }
2740     }
2741
2742     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2743 }
2744
2745 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2746 {
2747     if (dd->comm->ddpme[0].dim == XX)
2748     {
2749         return dd->comm->ddpme[0].maxshift;
2750     }
2751     else
2752     {
2753         return 0;
2754     }
2755 }
2756
2757 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2758 {
2759     if (dd->comm->ddpme[0].dim == YY)
2760     {
2761         return dd->comm->ddpme[0].maxshift;
2762     }
2763     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2764     {
2765         return dd->comm->ddpme[1].maxshift;
2766     }
2767     else
2768     {
2769         return 0;
2770     }
2771 }
2772
2773 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2774                              bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2775 {
2776     gmx_domdec_comm_t *comm;
2777     int  nc,ns,s;
2778     int  *xmin,*xmax;
2779     real range,pme_boundary;
2780     int  sh;
2781
2782     comm = dd->comm;
2783     nc  = dd->nc[ddpme->dim];
2784     ns  = ddpme->nslab;
2785
2786     if (!ddpme->dim_match)
2787     {
2788         /* PP decomposition is not along dim: the worst situation */
2789         sh = ns/2;
2790     }
2791     else if (ns <= 3 || (bUniform && ns == nc))
2792     {
2793         /* The optimal situation */
2794         sh = 1;
2795     }
2796     else
2797     {
2798         /* We need to check for all pme nodes which nodes they
2799          * could possibly need to communicate with.
2800          */
2801         xmin = ddpme->pp_min;
2802         xmax = ddpme->pp_max;
2803         /* Allow for atoms to be maximally 2/3 times the cut-off
2804          * out of their DD cell. This is a reasonable balance between
2805          * between performance and support for most charge-group/cut-off
2806          * combinations.
2807          */
2808         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2809         /* Avoid extra communication when we are exactly at a boundary */
2810         range *= 0.999;
2811
2812         sh = 1;
2813         for(s=0; s<ns; s++)
2814         {
2815             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2816             pme_boundary = (real)s/ns;
2817             while (sh+1 < ns &&
2818                    ((s-(sh+1) >= 0 &&
2819                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2820                     (s-(sh+1) <  0 &&
2821                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2822             {
2823                 sh++;
2824             }
2825             pme_boundary = (real)(s+1)/ns;
2826             while (sh+1 < ns &&
2827                    ((s+(sh+1) <  ns &&
2828                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2829                     (s+(sh+1) >= ns &&
2830                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2831             {
2832                 sh++;
2833             }
2834         }
2835     }
2836
2837     ddpme->maxshift = sh;
2838
2839     if (debug)
2840     {
2841         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2842                 ddpme->dim,ddpme->maxshift);
2843     }
2844 }
2845
2846 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2847 {
2848     int d,dim;
2849
2850     for(d=0; d<dd->ndim; d++)
2851     {
2852         dim = dd->dim[d];
2853         if (dim < ddbox->nboundeddim &&
2854             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2855             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2856         {
2857             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2858                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2859                       dd->nc[dim],dd->comm->cellsize_limit);
2860         }
2861     }
2862 }
2863
2864 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2865                                   bool bMaster,ivec npulse)
2866 {
2867     gmx_domdec_comm_t *comm;
2868     int  d,j;
2869     rvec cellsize_min;
2870     real *cell_x,cell_dx,cellsize;
2871
2872     comm = dd->comm;
2873
2874     for(d=0; d<DIM; d++)
2875     {
2876         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2877         npulse[d] = 1;
2878         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2879         {
2880             /* Uniform grid */
2881             cell_dx = ddbox->box_size[d]/dd->nc[d];
2882             if (bMaster)
2883             {
2884                 for(j=0; j<dd->nc[d]+1; j++)
2885                 {
2886                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2887                 }
2888             }
2889             else
2890             {
2891                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2892                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2893             }
2894             cellsize = cell_dx*ddbox->skew_fac[d];
2895             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2896             {
2897                 npulse[d]++;
2898             }
2899             cellsize_min[d] = cellsize;
2900         }
2901         else
2902         {
2903             /* Statically load balanced grid */
2904             /* Also when we are not doing a master distribution we determine
2905              * all cell borders in a loop to obtain identical values
2906              * to the master distribution case and to determine npulse.
2907              */
2908             if (bMaster)
2909             {
2910                 cell_x = dd->ma->cell_x[d];
2911             }
2912             else
2913             {
2914                 snew(cell_x,dd->nc[d]+1);
2915             }
2916             cell_x[0] = ddbox->box0[d];
2917             for(j=0; j<dd->nc[d]; j++)
2918             {
2919                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2920                 cell_x[j+1] = cell_x[j] + cell_dx;
2921                 cellsize = cell_dx*ddbox->skew_fac[d];
2922                 while (cellsize*npulse[d] < comm->cutoff &&
2923                        npulse[d] < dd->nc[d]-1)
2924                 {
2925                     npulse[d]++;
2926                 }
2927                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2928             }
2929             if (!bMaster)
2930             {
2931                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2932                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2933                 sfree(cell_x);
2934             }
2935         }
2936         /* The following limitation is to avoid that a cell would receive
2937          * some of its own home charge groups back over the periodic boundary.
2938          * Double charge groups cause trouble with the global indices.
2939          */
2940         if (d < ddbox->npbcdim &&
2941             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2942         {
2943             gmx_fatal_collective(FARGS,NULL,dd,
2944                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2945                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2946                                  comm->cutoff,
2947                                  dd->nc[d],dd->nc[d],
2948                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2949         }
2950     }
2951
2952     if (!comm->bDynLoadBal)
2953     {
2954         copy_rvec(cellsize_min,comm->cellsize_min);
2955     }
2956
2957     for(d=0; d<comm->npmedecompdim; d++)
2958     {
2959         set_pme_maxshift(dd,&comm->ddpme[d],
2960                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2961                          comm->ddpme[d].slb_dim_f);
2962     }
2963 }
2964
2965
2966 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2967                                        int d,int dim,gmx_domdec_root_t *root,
2968                                        gmx_ddbox_t *ddbox,
2969                                        bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2970 {
2971     gmx_domdec_comm_t *comm;
2972     int  ncd,i,j,nmin,nmin_old;
2973     bool bLimLo,bLimHi;
2974     real *cell_size;
2975     real fac,halfway,cellsize_limit_f_i,region_size;
2976     bool bPBC,bLastHi=FALSE;
2977     int nrange[]={range[0],range[1]};
2978
2979     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2980
2981     comm = dd->comm;
2982
2983     ncd = dd->nc[dim];
2984
2985     bPBC = (dim < ddbox->npbcdim);
2986
2987     cell_size = root->buf_ncd;
2988
2989     if (debug)
2990     {
2991         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2992     }
2993
2994     /* First we need to check if the scaling does not make cells
2995      * smaller than the smallest allowed size.
2996      * We need to do this iteratively, since if a cell is too small,
2997      * it needs to be enlarged, which makes all the other cells smaller,
2998      * which could in turn make another cell smaller than allowed.
2999      */
3000     for(i=range[0]; i<range[1]; i++)
3001     {
3002         root->bCellMin[i] = FALSE;
3003     }
3004     nmin = 0;
3005     do
3006     {
3007         nmin_old = nmin;
3008         /* We need the total for normalization */
3009         fac = 0;
3010         for(i=range[0]; i<range[1]; i++)
3011         {
3012             if (root->bCellMin[i] == FALSE)
3013             {
3014                 fac += cell_size[i];
3015             }
3016         }
3017         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3018         /* Determine the cell boundaries */
3019         for(i=range[0]; i<range[1]; i++)
3020         {
3021             if (root->bCellMin[i] == FALSE)
3022             {
3023                 cell_size[i] *= fac;
3024                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3025                 {
3026                     cellsize_limit_f_i = 0;
3027                 }
3028                 else
3029                 {
3030                     cellsize_limit_f_i = cellsize_limit_f;
3031                 }
3032                 if (cell_size[i] < cellsize_limit_f_i)
3033                 {
3034                     root->bCellMin[i] = TRUE;
3035                     cell_size[i] = cellsize_limit_f_i;
3036                     nmin++;
3037                 }
3038             }
3039             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3040         }
3041     }
3042     while (nmin > nmin_old);
3043
3044     i=range[1]-1;
3045     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3046     /* For this check we should not use DD_CELL_MARGIN,
3047      * but a slightly smaller factor,
3048      * since rounding could get use below the limit.
3049      */
3050     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3051     {
3052         char buf[22];
3053         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3054                   gmx_step_str(step,buf),
3055                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3056                   ncd,comm->cellsize_min[dim]);
3057     }
3058
3059     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3060
3061     if (!bUniform)
3062     {
3063         /* Check if the boundary did not displace more than halfway
3064          * each of the cells it bounds, as this could cause problems,
3065          * especially when the differences between cell sizes are large.
3066          * If changes are applied, they will not make cells smaller
3067          * than the cut-off, as we check all the boundaries which
3068          * might be affected by a change and if the old state was ok,
3069          * the cells will at most be shrunk back to their old size.
3070          */
3071         for(i=range[0]+1; i<range[1]; i++)
3072         {
3073             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3074             if (root->cell_f[i] < halfway)
3075             {
3076                 root->cell_f[i] = halfway;
3077                 /* Check if the change also causes shifts of the next boundaries */
3078                 for(j=i+1; j<range[1]; j++)
3079                 {
3080                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3081                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3082                 }
3083             }
3084             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3085             if (root->cell_f[i] > halfway)
3086             {
3087                 root->cell_f[i] = halfway;
3088                 /* Check if the change also causes shifts of the next boundaries */
3089                 for(j=i-1; j>=range[0]+1; j--)
3090                 {
3091                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3092                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3093                 }
3094             }
3095         }
3096     }
3097
3098     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3099     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3100      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3101      * for a and b nrange is used */
3102     if (d > 0)
3103     {
3104         /* Take care of the staggering of the cell boundaries */
3105         if (bUniform)
3106         {
3107             for(i=range[0]; i<range[1]; i++)
3108             {
3109                 root->cell_f_max0[i] = root->cell_f[i];
3110                 root->cell_f_min1[i] = root->cell_f[i+1];
3111             }
3112         }
3113         else
3114         {
3115             for(i=range[0]+1; i<range[1]; i++)
3116             {
3117                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3118                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3119                 if (bLimLo && bLimHi)
3120                 {
3121                     /* Both limits violated, try the best we can */
3122                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3123                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3124                     nrange[0]=range[0];
3125                     nrange[1]=i;
3126                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3127
3128                     nrange[0]=i;
3129                     nrange[1]=range[1];
3130                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3131
3132                     return;
3133                 }
3134                 else if (bLimLo)
3135                 {
3136                     /* root->cell_f[i] = root->bound_min[i]; */
3137                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3138                     bLastHi=FALSE;
3139                 }
3140                 else if (bLimHi && !bLastHi)
3141                 {
3142                     bLastHi=TRUE;
3143                     if (nrange[1] < range[1])   /* found a LimLo before */
3144                     {
3145                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3146                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3147                         nrange[0]=nrange[1];
3148                     }
3149                     root->cell_f[i] = root->bound_max[i];
3150                     nrange[1]=i;
3151                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3152                     nrange[0]=i;
3153                     nrange[1]=range[1];
3154                 }
3155             }
3156             if (nrange[1] < range[1])   /* found last a LimLo */
3157             {
3158                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3159                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3160                 nrange[0]=nrange[1];
3161                 nrange[1]=range[1];
3162                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3163             }
3164             else if (nrange[0] > range[0]) /* found at least one LimHi */
3165             {
3166                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3167             }
3168         }
3169     }
3170 }
3171
3172
3173 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3174                                        int d,int dim,gmx_domdec_root_t *root,
3175                                        gmx_ddbox_t *ddbox,bool bDynamicBox,
3176                                        bool bUniform,gmx_large_int_t step)
3177 {
3178     gmx_domdec_comm_t *comm;
3179     int  ncd,d1,i,j,pos;
3180     real *cell_size;
3181     real load_aver,load_i,imbalance,change,change_max,sc;
3182     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3183     real change_limit = 0.1;
3184     real relax = 0.5;
3185     bool bPBC;
3186     int range[] = { 0, 0 };
3187
3188     comm = dd->comm;
3189
3190     ncd = dd->nc[dim];
3191
3192     bPBC = (dim < ddbox->npbcdim);
3193
3194     cell_size = root->buf_ncd;
3195
3196     /* Store the original boundaries */
3197     for(i=0; i<ncd+1; i++)
3198     {
3199         root->old_cell_f[i] = root->cell_f[i];
3200     }
3201     if (bUniform) {
3202         for(i=0; i<ncd; i++)
3203         {
3204             cell_size[i] = 1.0/ncd;
3205         }
3206     }
3207     else if (dd_load_count(comm))
3208     {
3209         load_aver = comm->load[d].sum_m/ncd;
3210         change_max = 0;
3211         for(i=0; i<ncd; i++)
3212         {
3213             /* Determine the relative imbalance of cell i */
3214             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3215             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3216             /* Determine the change of the cell size using underrelaxation */
3217             change = -relax*imbalance;
3218             change_max = max(change_max,max(change,-change));
3219         }
3220         /* Limit the amount of scaling.
3221          * We need to use the same rescaling for all cells in one row,
3222          * otherwise the load balancing might not converge.
3223          */
3224         sc = relax;
3225         if (change_max > change_limit)
3226         {
3227             sc *= change_limit/change_max;
3228         }
3229         for(i=0; i<ncd; i++)
3230         {
3231             /* Determine the relative imbalance of cell i */
3232             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3233             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3234             /* Determine the change of the cell size using underrelaxation */
3235             change = -sc*imbalance;
3236             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3237         }
3238     }
3239
3240     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3241     cellsize_limit_f *= DD_CELL_MARGIN;
3242     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3243     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3244     if (ddbox->tric_dir[dim])
3245     {
3246         cellsize_limit_f /= ddbox->skew_fac[dim];
3247         dist_min_f       /= ddbox->skew_fac[dim];
3248     }
3249     if (bDynamicBox && d > 0)
3250     {
3251         dist_min_f *= DD_PRES_SCALE_MARGIN;
3252     }
3253     if (d > 0 && !bUniform)
3254     {
3255         /* Make sure that the grid is not shifted too much */
3256         for(i=1; i<ncd; i++) {
3257             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3258             {
3259                 gmx_incons("Inconsistent DD boundary staggering limits!");
3260             }
3261             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3262             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3263             if (space > 0) {
3264                 root->bound_min[i] += 0.5*space;
3265             }
3266             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3267             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3268             if (space < 0) {
3269                 root->bound_max[i] += 0.5*space;
3270             }
3271             if (debug)
3272             {
3273                 fprintf(debug,
3274                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3275                         d,i,
3276                         root->cell_f_max0[i-1] + dist_min_f,
3277                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3278                         root->cell_f_min1[i] - dist_min_f);
3279             }
3280         }
3281     }
3282     range[1]=ncd;
3283     root->cell_f[0] = 0;
3284     root->cell_f[ncd] = 1;
3285     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3286
3287
3288     /* After the checks above, the cells should obey the cut-off
3289      * restrictions, but it does not hurt to check.
3290      */
3291     for(i=0; i<ncd; i++)
3292     {
3293         if (debug)
3294         {
3295             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3296                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3297         }
3298
3299         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3300             root->cell_f[i+1] - root->cell_f[i] <
3301             cellsize_limit_f/DD_CELL_MARGIN)
3302         {
3303             char buf[22];
3304             fprintf(stderr,
3305                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3306                     gmx_step_str(step,buf),dim2char(dim),i,
3307                     (root->cell_f[i+1] - root->cell_f[i])
3308                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3309         }
3310     }
3311
3312     pos = ncd + 1;
3313     /* Store the cell boundaries of the lower dimensions at the end */
3314     for(d1=0; d1<d; d1++)
3315     {
3316         root->cell_f[pos++] = comm->cell_f0[d1];
3317         root->cell_f[pos++] = comm->cell_f1[d1];
3318     }
3319
3320     if (d < comm->npmedecompdim)
3321     {
3322         /* The master determines the maximum shift for
3323          * the coordinate communication between separate PME nodes.
3324          */
3325         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3326     }
3327     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3328     if (d >= 1)
3329     {
3330         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3331     }
3332 }
3333
3334 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3335                                              gmx_ddbox_t *ddbox,int dimind)
3336 {
3337     gmx_domdec_comm_t *comm;
3338     int dim;
3339
3340     comm = dd->comm;
3341
3342     /* Set the cell dimensions */
3343     dim = dd->dim[dimind];
3344     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3345     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3346     if (dim >= ddbox->nboundeddim)
3347     {
3348         comm->cell_x0[dim] += ddbox->box0[dim];
3349         comm->cell_x1[dim] += ddbox->box0[dim];
3350     }
3351 }
3352
3353 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3354                                          int d,int dim,real *cell_f_row,
3355                                          gmx_ddbox_t *ddbox)
3356 {
3357     gmx_domdec_comm_t *comm;
3358     int d1,dim1,pos;
3359
3360     comm = dd->comm;
3361
3362 #ifdef GMX_MPI
3363     /* Each node would only need to know two fractions,
3364      * but it is probably cheaper to broadcast the whole array.
3365      */
3366     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3367               0,comm->mpi_comm_load[d]);
3368 #endif
3369     /* Copy the fractions for this dimension from the buffer */
3370     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3371     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3372     /* The whole array was communicated, so set the buffer position */
3373     pos = dd->nc[dim] + 1;
3374     for(d1=0; d1<=d; d1++)
3375     {
3376         if (d1 < d)
3377         {
3378             /* Copy the cell fractions of the lower dimensions */
3379             comm->cell_f0[d1] = cell_f_row[pos++];
3380             comm->cell_f1[d1] = cell_f_row[pos++];
3381         }
3382         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3383     }
3384     /* Convert the communicated shift from float to int */
3385     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3386     if (d >= 1)
3387     {
3388         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3389     }
3390 }
3391
3392 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3393                                          gmx_ddbox_t *ddbox,bool bDynamicBox,
3394                                          bool bUniform,gmx_large_int_t step)
3395 {
3396     gmx_domdec_comm_t *comm;
3397     int d,dim,d1;
3398     bool bRowMember,bRowRoot;
3399     real *cell_f_row;
3400
3401     comm = dd->comm;
3402
3403     for(d=0; d<dd->ndim; d++)
3404     {
3405         dim = dd->dim[d];
3406         bRowMember = TRUE;
3407         bRowRoot = TRUE;
3408         for(d1=d; d1<dd->ndim; d1++)
3409         {
3410             if (dd->ci[dd->dim[d1]] > 0)
3411             {
3412                 if (d1 > d)
3413                 {
3414                     bRowMember = FALSE;
3415                 }
3416                 bRowRoot = FALSE;
3417             }
3418         }
3419         if (bRowMember)
3420         {
3421             if (bRowRoot)
3422             {
3423                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3424                                            ddbox,bDynamicBox,bUniform,step);
3425                 cell_f_row = comm->root[d]->cell_f;
3426             }
3427             else
3428             {
3429                 cell_f_row = comm->cell_f_row;
3430             }
3431             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3432         }
3433     }
3434 }
3435
3436 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3437 {
3438     int d;
3439
3440     /* This function assumes the box is static and should therefore
3441      * not be called when the box has changed since the last
3442      * call to dd_partition_system.
3443      */
3444     for(d=0; d<dd->ndim; d++)
3445     {
3446         relative_to_absolute_cell_bounds(dd,ddbox,d);
3447     }
3448 }
3449
3450
3451
3452 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3453                                   gmx_ddbox_t *ddbox,bool bDynamicBox,
3454                                   bool bUniform,bool bDoDLB,gmx_large_int_t step,
3455                                   gmx_wallcycle_t wcycle)
3456 {
3457     gmx_domdec_comm_t *comm;
3458     int dim;
3459
3460     comm = dd->comm;
3461
3462     if (bDoDLB)
3463     {
3464         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3465         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3466         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3467     }
3468     else if (bDynamicBox)
3469     {
3470         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3471     }
3472
3473     /* Set the dimensions for which no DD is used */
3474     for(dim=0; dim<DIM; dim++) {
3475         if (dd->nc[dim] == 1) {
3476             comm->cell_x0[dim] = 0;
3477             comm->cell_x1[dim] = ddbox->box_size[dim];
3478             if (dim >= ddbox->nboundeddim)
3479             {
3480                 comm->cell_x0[dim] += ddbox->box0[dim];
3481                 comm->cell_x1[dim] += ddbox->box0[dim];
3482             }
3483         }
3484     }
3485 }
3486
3487 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3488 {
3489     int d,np,i;
3490     gmx_domdec_comm_dim_t *cd;
3491
3492     for(d=0; d<dd->ndim; d++)
3493     {
3494         cd = &dd->comm->cd[d];
3495         np = npulse[dd->dim[d]];
3496         if (np > cd->np_nalloc)
3497         {
3498             if (debug)
3499             {
3500                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3501                         dim2char(dd->dim[d]),np);
3502             }
3503             if (DDMASTER(dd) && cd->np_nalloc > 0)
3504             {
3505                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3506             }
3507             srenew(cd->ind,np);
3508             for(i=cd->np_nalloc; i<np; i++)
3509             {
3510                 cd->ind[i].index  = NULL;
3511                 cd->ind[i].nalloc = 0;
3512             }
3513             cd->np_nalloc = np;
3514         }
3515         cd->np = np;
3516     }
3517 }
3518
3519
3520 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3521                               gmx_ddbox_t *ddbox,bool bDynamicBox,
3522                               bool bUniform,bool bDoDLB,gmx_large_int_t step,
3523                               gmx_wallcycle_t wcycle)
3524 {
3525     gmx_domdec_comm_t *comm;
3526     int  d;
3527     ivec npulse;
3528
3529     comm = dd->comm;
3530
3531     /* Copy the old cell boundaries for the cg displacement check */
3532     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3533     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3534
3535     if (comm->bDynLoadBal)
3536     {
3537         if (DDMASTER(dd))
3538         {
3539             check_box_size(dd,ddbox);
3540         }
3541         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3542     }
3543     else
3544     {
3545         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3546         realloc_comm_ind(dd,npulse);
3547     }
3548
3549     if (debug)
3550     {
3551         for(d=0; d<DIM; d++)
3552         {
3553             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3554                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3555         }
3556     }
3557 }
3558
3559 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3560                                   gmx_ddbox_t *ddbox,
3561                                   rvec cell_ns_x0,rvec cell_ns_x1,
3562                                   gmx_large_int_t step)
3563 {
3564     gmx_domdec_comm_t *comm;
3565     int dim_ind,dim;
3566
3567     comm = dd->comm;
3568
3569     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3570     {
3571         dim = dd->dim[dim_ind];
3572
3573         /* Without PBC we don't have restrictions on the outer cells */
3574         if (!(dim >= ddbox->npbcdim &&
3575               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3576             comm->bDynLoadBal &&
3577             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3578             comm->cellsize_min[dim])
3579         {
3580             char buf[22];
3581             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3582                       gmx_step_str(step,buf),dim2char(dim),
3583                       comm->cell_x1[dim] - comm->cell_x0[dim],
3584                       ddbox->skew_fac[dim],
3585                       dd->comm->cellsize_min[dim],
3586                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3587         }
3588     }
3589
3590     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3591     {
3592         /* Communicate the boundaries and update cell_ns_x0/1 */
3593         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3594         if (dd->bGridJump && dd->ndim > 1)
3595         {
3596             check_grid_jump(step,dd,ddbox);
3597         }
3598     }
3599 }
3600
3601 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3602 {
3603     if (YY < npbcdim)
3604     {
3605         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3606     }
3607     else
3608     {
3609         tcm[YY][XX] = 0;
3610     }
3611     if (ZZ < npbcdim)
3612     {
3613         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3614         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3615     }
3616     else
3617     {
3618         tcm[ZZ][XX] = 0;
3619         tcm[ZZ][YY] = 0;
3620     }
3621 }
3622
3623 static void check_screw_box(matrix box)
3624 {
3625     /* Mathematical limitation */
3626     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3627     {
3628         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3629     }
3630
3631     /* Limitation due to the asymmetry of the eighth shell method */
3632     if (box[ZZ][YY] != 0)
3633     {
3634         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3635     }
3636 }
3637
3638 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3639                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3640                           gmx_domdec_t *dd)
3641 {
3642     gmx_domdec_master_t *ma;
3643     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3644     int  i,icg,j,k,k0,k1,d,npbcdim;
3645     matrix tcm;
3646     rvec box_size,cg_cm;
3647     ivec ind;
3648     real nrcg,inv_ncg,pos_d;
3649     atom_id *cgindex;
3650     bool bUnbounded,bScrew;
3651
3652     ma = dd->ma;
3653
3654     if (tmp_ind == NULL)
3655     {
3656         snew(tmp_nalloc,dd->nnodes);
3657         snew(tmp_ind,dd->nnodes);
3658         for(i=0; i<dd->nnodes; i++)
3659         {
3660             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3661             snew(tmp_ind[i],tmp_nalloc[i]);
3662         }
3663     }
3664
3665     /* Clear the count */
3666     for(i=0; i<dd->nnodes; i++)
3667     {
3668         ma->ncg[i] = 0;
3669         ma->nat[i] = 0;
3670     }
3671
3672     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3673
3674     cgindex = cgs->index;
3675
3676     /* Compute the center of geometry for all charge groups */
3677     for(icg=0; icg<cgs->nr; icg++)
3678     {
3679         k0      = cgindex[icg];
3680         k1      = cgindex[icg+1];
3681         nrcg    = k1 - k0;
3682         if (nrcg == 1)
3683         {
3684             copy_rvec(pos[k0],cg_cm);
3685         }
3686         else
3687         {
3688             inv_ncg = 1.0/nrcg;
3689
3690             clear_rvec(cg_cm);
3691             for(k=k0; (k<k1); k++)
3692             {
3693                 rvec_inc(cg_cm,pos[k]);
3694             }
3695             for(d=0; (d<DIM); d++)
3696             {
3697                 cg_cm[d] *= inv_ncg;
3698             }
3699         }
3700         /* Put the charge group in the box and determine the cell index */
3701         for(d=DIM-1; d>=0; d--) {
3702             pos_d = cg_cm[d];
3703             if (d < dd->npbcdim)
3704             {
3705                 bScrew = (dd->bScrewPBC && d == XX);
3706                 if (tric_dir[d] && dd->nc[d] > 1)
3707                 {
3708                     /* Use triclinic coordintates for this dimension */
3709                     for(j=d+1; j<DIM; j++)
3710                     {
3711                         pos_d += cg_cm[j]*tcm[j][d];
3712                     }
3713                 }
3714                 while(pos_d >= box[d][d])
3715                 {
3716                     pos_d -= box[d][d];
3717                     rvec_dec(cg_cm,box[d]);
3718                     if (bScrew)
3719                     {
3720                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3721                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3722                     }
3723                     for(k=k0; (k<k1); k++)
3724                     {
3725                         rvec_dec(pos[k],box[d]);
3726                         if (bScrew)
3727                         {
3728                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3729                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3730                         }
3731                     }
3732                 }
3733                 while(pos_d < 0)
3734                 {
3735                     pos_d += box[d][d];
3736                     rvec_inc(cg_cm,box[d]);
3737                     if (bScrew)
3738                     {
3739                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3740                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3741                     }
3742                     for(k=k0; (k<k1); k++)
3743                     {
3744                         rvec_inc(pos[k],box[d]);
3745                         if (bScrew) {
3746                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3747                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3748                         }
3749                     }
3750                 }
3751             }
3752             /* This could be done more efficiently */
3753             ind[d] = 0;
3754             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3755             {
3756                 ind[d]++;
3757             }
3758         }
3759         i = dd_index(dd->nc,ind);
3760         if (ma->ncg[i] == tmp_nalloc[i])
3761         {
3762             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3763             srenew(tmp_ind[i],tmp_nalloc[i]);
3764         }
3765         tmp_ind[i][ma->ncg[i]] = icg;
3766         ma->ncg[i]++;
3767         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3768     }
3769
3770     k1 = 0;
3771     for(i=0; i<dd->nnodes; i++)
3772     {
3773         ma->index[i] = k1;
3774         for(k=0; k<ma->ncg[i]; k++)
3775         {
3776             ma->cg[k1++] = tmp_ind[i][k];
3777         }
3778     }
3779     ma->index[dd->nnodes] = k1;
3780
3781     for(i=0; i<dd->nnodes; i++)
3782     {
3783         sfree(tmp_ind[i]);
3784     }
3785     sfree(tmp_ind);
3786     sfree(tmp_nalloc);
3787
3788     if (fplog)
3789     {
3790         char buf[22];
3791         fprintf(fplog,"Charge group distribution at step %s:",
3792                 gmx_step_str(step,buf));
3793         for(i=0; i<dd->nnodes; i++)
3794         {
3795             fprintf(fplog," %d",ma->ncg[i]);
3796         }
3797         fprintf(fplog,"\n");
3798     }
3799 }
3800
3801 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3802                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3803                                 rvec pos[])
3804 {
3805     gmx_domdec_master_t *ma=NULL;
3806     ivec npulse;
3807     int  i,cg_gl;
3808     int  *ibuf,buf2[2] = { 0, 0 };
3809
3810     if (DDMASTER(dd))
3811     {
3812         ma = dd->ma;
3813
3814         if (dd->bScrewPBC)
3815         {
3816             check_screw_box(box);
3817         }
3818
3819         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3820
3821         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3822         for(i=0; i<dd->nnodes; i++)
3823         {
3824             ma->ibuf[2*i]   = ma->ncg[i];
3825             ma->ibuf[2*i+1] = ma->nat[i];
3826         }
3827         ibuf = ma->ibuf;
3828     }
3829     else
3830     {
3831         ibuf = NULL;
3832     }
3833     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3834
3835     dd->ncg_home = buf2[0];
3836     dd->nat_home = buf2[1];
3837     dd->ncg_tot  = dd->ncg_home;
3838     dd->nat_tot  = dd->nat_home;
3839     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3840     {
3841         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3842         srenew(dd->index_gl,dd->cg_nalloc);
3843         srenew(dd->cgindex,dd->cg_nalloc+1);
3844     }
3845     if (DDMASTER(dd))
3846     {
3847         for(i=0; i<dd->nnodes; i++)
3848         {
3849             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3850             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3851         }
3852     }
3853
3854     dd_scatterv(dd,
3855                 DDMASTER(dd) ? ma->ibuf : NULL,
3856                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3857                 DDMASTER(dd) ? ma->cg : NULL,
3858                 dd->ncg_home*sizeof(int),dd->index_gl);
3859
3860     /* Determine the home charge group sizes */
3861     dd->cgindex[0] = 0;
3862     for(i=0; i<dd->ncg_home; i++)
3863     {
3864         cg_gl = dd->index_gl[i];
3865         dd->cgindex[i+1] =
3866             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3867     }
3868
3869     if (debug)
3870     {
3871         fprintf(debug,"Home charge groups:\n");
3872         for(i=0; i<dd->ncg_home; i++)
3873         {
3874             fprintf(debug," %d",dd->index_gl[i]);
3875             if (i % 10 == 9)
3876                 fprintf(debug,"\n");
3877         }
3878         fprintf(debug,"\n");
3879     }
3880 }
3881
3882 static int compact_and_copy_vec_at(int ncg,int *move,
3883                                    int *cgindex,
3884                                    int nvec,int vec,
3885                                    rvec *src,gmx_domdec_comm_t *comm,
3886                                    bool bCompact)
3887 {
3888     int m,icg,i,i0,i1,nrcg;
3889     int home_pos;
3890     int pos_vec[DIM*2];
3891
3892     home_pos = 0;
3893
3894     for(m=0; m<DIM*2; m++)
3895     {
3896         pos_vec[m] = 0;
3897     }
3898
3899     i0 = 0;
3900     for(icg=0; icg<ncg; icg++)
3901     {
3902         i1 = cgindex[icg+1];
3903         m = move[icg];
3904         if (m == -1)
3905         {
3906             if (bCompact)
3907             {
3908                 /* Compact the home array in place */
3909                 for(i=i0; i<i1; i++)
3910                 {
3911                     copy_rvec(src[i],src[home_pos++]);
3912                 }
3913             }
3914         }
3915         else
3916         {
3917             /* Copy to the communication buffer */
3918             nrcg = i1 - i0;
3919             pos_vec[m] += 1 + vec*nrcg;
3920             for(i=i0; i<i1; i++)
3921             {
3922                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3923             }
3924             pos_vec[m] += (nvec - vec - 1)*nrcg;
3925         }
3926         if (!bCompact)
3927         {
3928             home_pos += i1 - i0;
3929         }
3930         i0 = i1;
3931     }
3932
3933     return home_pos;
3934 }
3935
3936 static int compact_and_copy_vec_cg(int ncg,int *move,
3937                                    int *cgindex,
3938                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3939                                    bool bCompact)
3940 {
3941     int m,icg,i0,i1,nrcg;
3942     int home_pos;
3943     int pos_vec[DIM*2];
3944
3945     home_pos = 0;
3946
3947     for(m=0; m<DIM*2; m++)
3948     {
3949         pos_vec[m] = 0;
3950     }
3951
3952     i0 = 0;
3953     for(icg=0; icg<ncg; icg++)
3954     {
3955         i1 = cgindex[icg+1];
3956         m = move[icg];
3957         if (m == -1)
3958         {
3959             if (bCompact)
3960             {
3961                 /* Compact the home array in place */
3962                 copy_rvec(src[icg],src[home_pos++]);
3963             }
3964         }
3965         else
3966         {
3967             nrcg = i1 - i0;
3968             /* Copy to the communication buffer */
3969             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3970             pos_vec[m] += 1 + nrcg*nvec;
3971         }
3972         i0 = i1;
3973     }
3974     if (!bCompact)
3975     {
3976         home_pos = ncg;
3977     }
3978
3979     return home_pos;
3980 }
3981
3982 static int compact_ind(int ncg,int *move,
3983                        int *index_gl,int *cgindex,
3984                        int *gatindex,
3985                        gmx_ga2la_t ga2la,char *bLocalCG,
3986                        int *cginfo)
3987 {
3988     int cg,nat,a0,a1,a,a_gl;
3989     int home_pos;
3990
3991     home_pos = 0;
3992     nat = 0;
3993     for(cg=0; cg<ncg; cg++)
3994     {
3995         a0 = cgindex[cg];
3996         a1 = cgindex[cg+1];
3997         if (move[cg] == -1)
3998         {
3999             /* Compact the home arrays in place.
4000              * Anything that can be done here avoids access to global arrays.
4001              */
4002             cgindex[home_pos] = nat;
4003             for(a=a0; a<a1; a++)
4004             {
4005                 a_gl = gatindex[a];
4006                 gatindex[nat] = a_gl;
4007                 /* The cell number stays 0, so we don't need to set it */
4008                 ga2la_change_la(ga2la,a_gl,nat);
4009                 nat++;
4010             }
4011             index_gl[home_pos] = index_gl[cg];
4012             cginfo[home_pos]   = cginfo[cg];
4013             /* The charge group remains local, so bLocalCG does not change */
4014             home_pos++;
4015         }
4016         else
4017         {
4018             /* Clear the global indices */
4019             for(a=a0; a<a1; a++)
4020             {
4021                 ga2la_del(ga2la,gatindex[a]);
4022             }
4023             if (bLocalCG)
4024             {
4025                 bLocalCG[index_gl[cg]] = FALSE;
4026             }
4027         }
4028     }
4029     cgindex[home_pos] = nat;
4030
4031     return home_pos;
4032 }
4033
4034 static void clear_and_mark_ind(int ncg,int *move,
4035                                int *index_gl,int *cgindex,int *gatindex,
4036                                gmx_ga2la_t ga2la,char *bLocalCG,
4037                                int *cell_index)
4038 {
4039     int cg,a0,a1,a;
4040
4041     for(cg=0; cg<ncg; cg++)
4042     {
4043         if (move[cg] >= 0)
4044         {
4045             a0 = cgindex[cg];
4046             a1 = cgindex[cg+1];
4047             /* Clear the global indices */
4048             for(a=a0; a<a1; a++)
4049             {
4050                 ga2la_del(ga2la,gatindex[a]);
4051             }
4052             if (bLocalCG)
4053             {
4054                 bLocalCG[index_gl[cg]] = FALSE;
4055             }
4056             /* Signal that this cg has moved using the ns cell index.
4057              * Here we set it to -1.
4058              * fill_grid will change it from -1 to 4*grid->ncells.
4059              */
4060             cell_index[cg] = -1;
4061         }
4062     }
4063 }
4064
4065 static void print_cg_move(FILE *fplog,
4066                           gmx_domdec_t *dd,
4067                           gmx_large_int_t step,int cg,int dim,int dir,
4068                           bool bHaveLimitdAndCMOld,real limitd,
4069                           rvec cm_old,rvec cm_new,real pos_d)
4070 {
4071     gmx_domdec_comm_t *comm;
4072     char buf[22];
4073
4074     comm = dd->comm;
4075
4076     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4077     if (bHaveLimitdAndCMOld)
4078     {
4079         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4080                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4081     }
4082     else
4083     {
4084         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4085                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4086     }
4087     fprintf(fplog,"distance out of cell %f\n",
4088             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4089     if (bHaveLimitdAndCMOld)
4090     {
4091         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4092                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4093     }
4094     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4095             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4096     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4097             dim2char(dim),
4098             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4099     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4100             dim2char(dim),
4101             comm->cell_x0[dim],comm->cell_x1[dim]);
4102 }
4103
4104 static void cg_move_error(FILE *fplog,
4105                           gmx_domdec_t *dd,
4106                           gmx_large_int_t step,int cg,int dim,int dir,
4107                           bool bHaveLimitdAndCMOld,real limitd,
4108                           rvec cm_old,rvec cm_new,real pos_d)
4109 {
4110     if (fplog)
4111     {
4112         print_cg_move(fplog, dd,step,cg,dim,dir,
4113                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4114     }
4115     print_cg_move(stderr,dd,step,cg,dim,dir,
4116                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4117     gmx_fatal(FARGS,
4118               "A charge group moved too far between two domain decomposition steps\n"
4119               "This usually means that your system is not well equilibrated");
4120 }
4121
4122 static void rotate_state_atom(t_state *state,int a)
4123 {
4124     int est;
4125
4126     for(est=0; est<estNR; est++)
4127     {
4128         if (EST_DISTR(est) && state->flags & (1<<est)) {
4129             switch (est) {
4130             case estX:
4131                 /* Rotate the complete state; for a rectangular box only */
4132                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4133                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4134                 break;
4135             case estV:
4136                 state->v[a][YY] = -state->v[a][YY];
4137                 state->v[a][ZZ] = -state->v[a][ZZ];
4138                 break;
4139             case estSDX:
4140                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4141                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4142                 break;
4143             case estCGP:
4144                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4145                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4146                 break;
4147             case estDISRE_INITF:
4148             case estDISRE_RM3TAV:
4149             case estORIRE_INITF:
4150             case estORIRE_DTAV:
4151                 /* These are distances, so not affected by rotation */
4152                 break;
4153             default:
4154                 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4155             }
4156         }
4157     }
4158 }
4159
4160 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4161                               gmx_domdec_t *dd,ivec tric_dir,
4162                               t_state *state,rvec **f,
4163                               t_forcerec *fr,t_mdatoms *md,
4164                               bool bCompact,
4165                               t_nrnb *nrnb)
4166 {
4167     int  *move;
4168     int  npbcdim;
4169     int  ncg[DIM*2],nat[DIM*2];
4170     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4171     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4172     int  sbuf[2],rbuf[2];
4173     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4174     int  flag;
4175     bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4176     bool bScrew;
4177     ivec dev;
4178     real inv_ncg,pos_d;
4179     matrix tcm;
4180     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4181     atom_id *cgindex;
4182     cginfo_mb_t *cginfo_mb;
4183     gmx_domdec_comm_t *comm;
4184
4185     if (dd->bScrewPBC)
4186     {
4187         check_screw_box(state->box);
4188     }
4189
4190     comm  = dd->comm;
4191     cg_cm = fr->cg_cm;
4192
4193     for(i=0; i<estNR; i++)
4194     {
4195         if (EST_DISTR(i))
4196         {
4197             switch (i)
4198             {
4199             case estX:   /* Always present */            break;
4200             case estV:   bV   = (state->flags & (1<<i)); break;
4201             case estSDX: bSDX = (state->flags & (1<<i)); break;
4202             case estCGP: bCGP = (state->flags & (1<<i)); break;
4203             case estLD_RNG:
4204             case estLD_RNGI:
4205             case estDISRE_INITF:
4206             case estDISRE_RM3TAV:
4207             case estORIRE_INITF:
4208             case estORIRE_DTAV:
4209                 /* No processing required */
4210                 break;
4211             default:
4212             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4213             }
4214         }
4215     }
4216
4217     if (dd->ncg_tot > comm->nalloc_int)
4218     {
4219         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4220         srenew(comm->buf_int,comm->nalloc_int);
4221     }
4222     move = comm->buf_int;
4223
4224     /* Clear the count */
4225     for(c=0; c<dd->ndim*2; c++)
4226     {
4227         ncg[c] = 0;
4228         nat[c] = 0;
4229     }
4230
4231     npbcdim = dd->npbcdim;
4232
4233     for(d=0; (d<DIM); d++)
4234     {
4235         limitd[d] = dd->comm->cellsize_min[d];
4236         if (d >= npbcdim && dd->ci[d] == 0)
4237         {
4238             cell_x0[d] = -GMX_FLOAT_MAX;
4239         }
4240         else
4241         {
4242             cell_x0[d] = comm->cell_x0[d];
4243         }
4244         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4245         {
4246             cell_x1[d] = GMX_FLOAT_MAX;
4247         }
4248         else
4249         {
4250             cell_x1[d] = comm->cell_x1[d];
4251         }
4252         if (d < npbcdim)
4253         {
4254             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4255             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4256         }
4257         else
4258         {
4259             /* We check after communication if a charge group moved
4260              * more than one cell. Set the pre-comm check limit to float_max.
4261              */
4262             limit0[d] = -GMX_FLOAT_MAX;
4263             limit1[d] =  GMX_FLOAT_MAX;
4264         }
4265     }
4266
4267     make_tric_corr_matrix(npbcdim,state->box,tcm);
4268
4269     cgindex = dd->cgindex;
4270
4271     /* Compute the center of geometry for all home charge groups
4272      * and put them in the box and determine where they should go.
4273      */
4274     for(cg=0; cg<dd->ncg_home; cg++)
4275     {
4276         k0   = cgindex[cg];
4277         k1   = cgindex[cg+1];
4278         nrcg = k1 - k0;
4279         if (nrcg == 1)
4280         {
4281             copy_rvec(state->x[k0],cm_new);
4282         }
4283         else
4284         {
4285             inv_ncg = 1.0/nrcg;
4286
4287             clear_rvec(cm_new);
4288             for(k=k0; (k<k1); k++)
4289             {
4290                 rvec_inc(cm_new,state->x[k]);
4291             }
4292             for(d=0; (d<DIM); d++)
4293             {
4294                 cm_new[d] = inv_ncg*cm_new[d];
4295             }
4296         }
4297
4298         clear_ivec(dev);
4299         /* Do pbc and check DD cell boundary crossings */
4300         for(d=DIM-1; d>=0; d--)
4301         {
4302             if (dd->nc[d] > 1)
4303             {
4304                 bScrew = (dd->bScrewPBC && d == XX);
4305                 /* Determine the location of this cg in lattice coordinates */
4306                 pos_d = cm_new[d];
4307                 if (tric_dir[d])
4308                 {
4309                     for(d2=d+1; d2<DIM; d2++)
4310                     {
4311                         pos_d += cm_new[d2]*tcm[d2][d];
4312                     }
4313                 }
4314                 /* Put the charge group in the triclinic unit-cell */
4315                 if (pos_d >= cell_x1[d])
4316                 {
4317                     if (pos_d >= limit1[d])
4318                     {
4319                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4320                                       cg_cm[cg],cm_new,pos_d);
4321                     }
4322                     dev[d] = 1;
4323                     if (dd->ci[d] == dd->nc[d] - 1)
4324                     {
4325                         rvec_dec(cm_new,state->box[d]);
4326                         if (bScrew)
4327                         {
4328                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4329                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4330                         }
4331                         for(k=k0; (k<k1); k++)
4332                         {
4333                             rvec_dec(state->x[k],state->box[d]);
4334                             if (bScrew)
4335                             {
4336                                 rotate_state_atom(state,k);
4337                             }
4338                         }
4339                     }
4340                 }
4341                 else if (pos_d < cell_x0[d])
4342                 {
4343                     if (pos_d < limit0[d])
4344                     {
4345                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4346                                       cg_cm[cg],cm_new,pos_d);
4347                     }
4348                     dev[d] = -1;
4349                     if (dd->ci[d] == 0)
4350                     {
4351                         rvec_inc(cm_new,state->box[d]);
4352                         if (bScrew)
4353                         {
4354                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4355                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4356                         }
4357                         for(k=k0; (k<k1); k++)
4358                         {
4359                             rvec_inc(state->x[k],state->box[d]);
4360                             if (bScrew)
4361                             {
4362                                 rotate_state_atom(state,k);
4363                             }
4364                         }
4365                     }
4366                 }
4367             }
4368             else if (d < npbcdim)
4369             {
4370                 /* Put the charge group in the rectangular unit-cell */
4371                 while (cm_new[d] >= state->box[d][d])
4372                 {
4373                     rvec_dec(cm_new,state->box[d]);
4374                     for(k=k0; (k<k1); k++)
4375                     {
4376                         rvec_dec(state->x[k],state->box[d]);
4377                     }
4378                 }
4379                 while (cm_new[d] < 0)
4380                 {
4381                     rvec_inc(cm_new,state->box[d]);
4382                     for(k=k0; (k<k1); k++)
4383                     {
4384                         rvec_inc(state->x[k],state->box[d]);
4385                     }
4386                 }
4387             }
4388         }
4389
4390         copy_rvec(cm_new,cg_cm[cg]);
4391
4392         /* Determine where this cg should go */
4393         flag = 0;
4394         mc = -1;
4395         for(d=0; d<dd->ndim; d++)
4396         {
4397             dim = dd->dim[d];
4398             if (dev[dim] == 1)
4399             {
4400                 flag |= DD_FLAG_FW(d);
4401                 if (mc == -1)
4402                 {
4403                     mc = d*2;
4404                 }
4405             }
4406             else if (dev[dim] == -1)
4407             {
4408                 flag |= DD_FLAG_BW(d);
4409                 if (mc == -1) {
4410                     if (dd->nc[dim] > 2)
4411                     {
4412                         mc = d*2 + 1;
4413                     }
4414                     else
4415                     {
4416                         mc = d*2;
4417                     }
4418                 }
4419             }
4420         }
4421         move[cg] = mc;
4422         if (mc >= 0)
4423         {
4424             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4425             {
4426                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4427                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4428             }
4429             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4430             /* We store the cg size in the lower 16 bits
4431              * and the place where the charge group should go
4432              * in the next 6 bits. This saves some communication volume.
4433              */
4434             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4435             ncg[mc] += 1;
4436             nat[mc] += nrcg;
4437         }
4438     }
4439
4440     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4441     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4442
4443     nvec = 1;
4444     if (bV)
4445     {
4446         nvec++;
4447     }
4448     if (bSDX)
4449     {
4450         nvec++;
4451     }
4452     if (bCGP)
4453     {
4454         nvec++;
4455     }
4456
4457     /* Make sure the communication buffers are large enough */
4458     for(mc=0; mc<dd->ndim*2; mc++)
4459     {
4460         nvr = ncg[mc] + nat[mc]*nvec;
4461         if (nvr > comm->cgcm_state_nalloc[mc])
4462         {
4463             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4464             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4465         }
4466     }
4467
4468     /* Recalculating cg_cm might be cheaper than communicating,
4469      * but that could give rise to rounding issues.
4470      */
4471     home_pos_cg =
4472         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4473                                 nvec,cg_cm,comm,bCompact);
4474
4475     vec = 0;
4476     home_pos_at =
4477         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4478                                 nvec,vec++,state->x,comm,bCompact);
4479     if (bV)
4480     {
4481         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4482                                 nvec,vec++,state->v,comm,bCompact);
4483     }
4484     if (bSDX)
4485     {
4486         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4487                                 nvec,vec++,state->sd_X,comm,bCompact);
4488     }
4489     if (bCGP)
4490     {
4491         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4492                                 nvec,vec++,state->cg_p,comm,bCompact);
4493     }
4494
4495     if (bCompact)
4496     {
4497         compact_ind(dd->ncg_home,move,
4498                     dd->index_gl,dd->cgindex,dd->gatindex,
4499                     dd->ga2la,comm->bLocalCG,
4500                     fr->cginfo);
4501     }
4502     else
4503     {
4504         clear_and_mark_ind(dd->ncg_home,move,
4505                            dd->index_gl,dd->cgindex,dd->gatindex,
4506                            dd->ga2la,comm->bLocalCG,
4507                            fr->ns.grid->cell_index);
4508     }
4509
4510     cginfo_mb = fr->cginfo_mb;
4511
4512     ncg_stay_home = home_pos_cg;
4513     for(d=0; d<dd->ndim; d++)
4514     {
4515         dim = dd->dim[d];
4516         ncg_recv = 0;
4517         nat_recv = 0;
4518         nvr      = 0;
4519         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4520         {
4521             cdd = d*2 + dir;
4522             /* Communicate the cg and atom counts */
4523             sbuf[0] = ncg[cdd];
4524             sbuf[1] = nat[cdd];
4525             if (debug)
4526             {
4527                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4528                         d,dir,sbuf[0],sbuf[1]);
4529             }
4530             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4531
4532             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4533             {
4534                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4535                 srenew(comm->buf_int,comm->nalloc_int);
4536             }
4537
4538             /* Communicate the charge group indices, sizes and flags */
4539             dd_sendrecv_int(dd, d, dir,
4540                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4541                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4542
4543             nvs = ncg[cdd] + nat[cdd]*nvec;
4544             i   = rbuf[0]  + rbuf[1] *nvec;
4545             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4546
4547             /* Communicate cgcm and state */
4548             dd_sendrecv_rvec(dd, d, dir,
4549                              comm->cgcm_state[cdd], nvs,
4550                              comm->vbuf.v+nvr, i);
4551             ncg_recv += rbuf[0];
4552             nat_recv += rbuf[1];
4553             nvr      += i;
4554         }
4555
4556         /* Process the received charge groups */
4557         buf_pos = 0;
4558         for(cg=0; cg<ncg_recv; cg++)
4559         {
4560             flag = comm->buf_int[cg*DD_CGIBS+1];
4561
4562             if (dim >= npbcdim && dd->nc[dim] > 2)
4563             {
4564                 /* No pbc in this dim and more than one domain boundary.
4565                  * We to a separate check if a charge did not move too far.
4566                  */
4567                 if (((flag & DD_FLAG_FW(d)) &&
4568                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4569                     ((flag & DD_FLAG_BW(d)) &&
4570                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4571                 {
4572                     cg_move_error(fplog,dd,step,cg,d,
4573                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4574                                    FALSE,0,
4575                                    comm->vbuf.v[buf_pos],
4576                                    comm->vbuf.v[buf_pos],
4577                                    comm->vbuf.v[buf_pos][d]);
4578                 }
4579             }
4580
4581             mc = -1;
4582             if (d < dd->ndim-1)
4583             {
4584                 /* Check which direction this cg should go */
4585                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4586                 {
4587                     if (dd->bGridJump)
4588                     {
4589                         /* The cell boundaries for dimension d2 are not equal
4590                          * for each cell row of the lower dimension(s),
4591                          * therefore we might need to redetermine where
4592                          * this cg should go.
4593                          */
4594                         dim2 = dd->dim[d2];
4595                         /* If this cg crosses the box boundary in dimension d2
4596                          * we can use the communicated flag, so we do not
4597                          * have to worry about pbc.
4598                          */
4599                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4600                                (flag & DD_FLAG_FW(d2))) ||
4601                               (dd->ci[dim2] == 0 &&
4602                                (flag & DD_FLAG_BW(d2)))))
4603                         {
4604                             /* Clear the two flags for this dimension */
4605                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4606                             /* Determine the location of this cg
4607                              * in lattice coordinates
4608                              */
4609                             pos_d = comm->vbuf.v[buf_pos][dim2];
4610                             if (tric_dir[dim2])
4611                             {
4612                                 for(d3=dim2+1; d3<DIM; d3++)
4613                                 {
4614                                     pos_d +=
4615                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4616                                 }
4617                             }
4618                             /* Check of we are not at the box edge.
4619                              * pbc is only handled in the first step above,
4620                              * but this check could move over pbc while
4621                              * the first step did not due to different rounding.
4622                              */
4623                             if (pos_d >= cell_x1[dim2] &&
4624                                 dd->ci[dim2] != dd->nc[dim2]-1)
4625                             {
4626                                 flag |= DD_FLAG_FW(d2);
4627                             }
4628                             else if (pos_d < cell_x0[dim2] &&
4629                                      dd->ci[dim2] != 0)
4630                             {
4631                                 flag |= DD_FLAG_BW(d2);
4632                             }
4633                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4634                         }
4635                     }
4636                     /* Set to which neighboring cell this cg should go */
4637                     if (flag & DD_FLAG_FW(d2))
4638                     {
4639                         mc = d2*2;
4640                     }
4641                     else if (flag & DD_FLAG_BW(d2))
4642                     {
4643                         if (dd->nc[dd->dim[d2]] > 2)
4644                         {
4645                             mc = d2*2+1;
4646                         }
4647                         else
4648                         {
4649                             mc = d2*2;
4650                         }
4651                     }
4652                 }
4653             }
4654
4655             nrcg = flag & DD_FLAG_NRCG;
4656             if (mc == -1)
4657             {
4658                 if (home_pos_cg+1 > dd->cg_nalloc)
4659                 {
4660                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4661                     srenew(dd->index_gl,dd->cg_nalloc);
4662                     srenew(dd->cgindex,dd->cg_nalloc+1);
4663                 }
4664                 /* Set the global charge group index and size */
4665                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4666                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4667                 /* Copy the state from the buffer */
4668                 if (home_pos_cg >= fr->cg_nalloc)
4669                 {
4670                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4671                     cg_cm = fr->cg_cm;
4672                 }
4673                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4674                 /* Set the cginfo */
4675                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4676                                                    dd->index_gl[home_pos_cg]);
4677                 if (comm->bLocalCG)
4678                 {
4679                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4680                 }
4681
4682                 if (home_pos_at+nrcg > state->nalloc)
4683                 {
4684                     dd_realloc_state(state,f,home_pos_at+nrcg);
4685                 }
4686                 for(i=0; i<nrcg; i++)
4687                 {
4688                     copy_rvec(comm->vbuf.v[buf_pos++],
4689                               state->x[home_pos_at+i]);
4690                 }
4691                 if (bV)
4692                 {
4693                     for(i=0; i<nrcg; i++)
4694                     {
4695                         copy_rvec(comm->vbuf.v[buf_pos++],
4696                                   state->v[home_pos_at+i]);
4697                     }
4698                 }
4699                 if (bSDX)
4700                 {
4701                     for(i=0; i<nrcg; i++)
4702                     {
4703                         copy_rvec(comm->vbuf.v[buf_pos++],
4704                                   state->sd_X[home_pos_at+i]);
4705                     }
4706                 }
4707                 if (bCGP)
4708                 {
4709                     for(i=0; i<nrcg; i++)
4710                     {
4711                         copy_rvec(comm->vbuf.v[buf_pos++],
4712                                   state->cg_p[home_pos_at+i]);
4713                     }
4714                 }
4715                 home_pos_cg += 1;
4716                 home_pos_at += nrcg;
4717             }
4718             else
4719             {
4720                 /* Reallocate the buffers if necessary  */
4721                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4722                 {
4723                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4724                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4725                 }
4726                 nvr = ncg[mc] + nat[mc]*nvec;
4727                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4728                 {
4729                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4730                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4731                 }
4732                 /* Copy from the receive to the send buffers */
4733                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4734                        comm->buf_int + cg*DD_CGIBS,
4735                        DD_CGIBS*sizeof(int));
4736                 memcpy(comm->cgcm_state[mc][nvr],
4737                        comm->vbuf.v[buf_pos],
4738                        (1+nrcg*nvec)*sizeof(rvec));
4739                 buf_pos += 1 + nrcg*nvec;
4740                 ncg[mc] += 1;
4741                 nat[mc] += nrcg;
4742             }
4743         }
4744     }
4745
4746     /* With sorting (!bCompact) the indices are now only partially up to date
4747      * and ncg_home and nat_home are not the real count, since there are
4748      * "holes" in the arrays for the charge groups that moved to neighbors.
4749      */
4750     dd->ncg_home = home_pos_cg;
4751     dd->nat_home = home_pos_at;
4752
4753     if (debug)
4754     {
4755         fprintf(debug,"Finished repartitioning\n");
4756     }
4757
4758     return ncg_stay_home;
4759 }
4760
4761 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4762 {
4763     dd->comm->cycl[ddCycl] += cycles;
4764     dd->comm->cycl_n[ddCycl]++;
4765     if (cycles > dd->comm->cycl_max[ddCycl])
4766     {
4767         dd->comm->cycl_max[ddCycl] = cycles;
4768     }
4769 }
4770
4771 static double force_flop_count(t_nrnb *nrnb)
4772 {
4773     int i;
4774     double sum;
4775     const char *name;
4776
4777     sum = 0;
4778     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4779     {
4780         /* To get closer to the real timings, we half the count
4781          * for the normal loops and again half it for water loops.
4782          */
4783         name = nrnb_str(i);
4784         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4785         {
4786             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4787         }
4788         else
4789         {
4790             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4791         }
4792     }
4793     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4794     {
4795         name = nrnb_str(i);
4796         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4797         sum += nrnb->n[i]*cost_nrnb(i);
4798     }
4799     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4800     {
4801         sum += nrnb->n[i]*cost_nrnb(i);
4802     }
4803
4804     return sum;
4805 }
4806
4807 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4808 {
4809     if (dd->comm->eFlop)
4810     {
4811         dd->comm->flop -= force_flop_count(nrnb);
4812     }
4813 }
4814 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4815 {
4816     if (dd->comm->eFlop)
4817     {
4818         dd->comm->flop += force_flop_count(nrnb);
4819         dd->comm->flop_n++;
4820     }
4821 }
4822
4823 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4824 {
4825     int i;
4826
4827     for(i=0; i<ddCyclNr; i++)
4828     {
4829         dd->comm->cycl[i] = 0;
4830         dd->comm->cycl_n[i] = 0;
4831         dd->comm->cycl_max[i] = 0;
4832     }
4833     dd->comm->flop = 0;
4834     dd->comm->flop_n = 0;
4835 }
4836
4837 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4838 {
4839     gmx_domdec_comm_t *comm;
4840     gmx_domdec_load_t *load;
4841     gmx_domdec_root_t *root=NULL;
4842     int  d,dim,cid,i,pos;
4843     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4844     bool bSepPME;
4845
4846     if (debug)
4847     {
4848         fprintf(debug,"get_load_distribution start\n");
4849     }
4850
4851     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4852
4853     comm = dd->comm;
4854
4855     bSepPME = (dd->pme_nodeid >= 0);
4856
4857     for(d=dd->ndim-1; d>=0; d--)
4858     {
4859         dim = dd->dim[d];
4860         /* Check if we participate in the communication in this dimension */
4861         if (d == dd->ndim-1 ||
4862             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4863         {
4864             load = &comm->load[d];
4865             if (dd->bGridJump)
4866             {
4867                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4868             }
4869             pos = 0;
4870             if (d == dd->ndim-1)
4871             {
4872                 sbuf[pos++] = dd_force_load(comm);
4873                 sbuf[pos++] = sbuf[0];
4874                 if (dd->bGridJump)
4875                 {
4876                     sbuf[pos++] = sbuf[0];
4877                     sbuf[pos++] = cell_frac;
4878                     if (d > 0)
4879                     {
4880                         sbuf[pos++] = comm->cell_f_max0[d];
4881                         sbuf[pos++] = comm->cell_f_min1[d];
4882                     }
4883                 }
4884                 if (bSepPME)
4885                 {
4886                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4887                     sbuf[pos++] = comm->cycl[ddCyclPME];
4888                 }
4889             }
4890             else
4891             {
4892                 sbuf[pos++] = comm->load[d+1].sum;
4893                 sbuf[pos++] = comm->load[d+1].max;
4894                 if (dd->bGridJump)
4895                 {
4896                     sbuf[pos++] = comm->load[d+1].sum_m;
4897                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4898                     sbuf[pos++] = comm->load[d+1].flags;
4899                     if (d > 0)
4900                     {
4901                         sbuf[pos++] = comm->cell_f_max0[d];
4902                         sbuf[pos++] = comm->cell_f_min1[d];
4903                     }
4904                 }
4905                 if (bSepPME)
4906                 {
4907                     sbuf[pos++] = comm->load[d+1].mdf;
4908                     sbuf[pos++] = comm->load[d+1].pme;
4909                 }
4910             }
4911             load->nload = pos;
4912             /* Communicate a row in DD direction d.
4913              * The communicators are setup such that the root always has rank 0.
4914              */
4915 #ifdef GMX_MPI
4916             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4917                        load->load,load->nload*sizeof(float),MPI_BYTE,
4918                        0,comm->mpi_comm_load[d]);
4919 #endif
4920             if (dd->ci[dim] == dd->master_ci[dim])
4921             {
4922                 /* We are the root, process this row */
4923                 if (comm->bDynLoadBal)
4924                 {
4925                     root = comm->root[d];
4926                 }
4927                 load->sum = 0;
4928                 load->max = 0;
4929                 load->sum_m = 0;
4930                 load->cvol_min = 1;
4931                 load->flags = 0;
4932                 load->mdf = 0;
4933                 load->pme = 0;
4934                 pos = 0;
4935                 for(i=0; i<dd->nc[dim]; i++)
4936                 {
4937                     load->sum += load->load[pos++];
4938                     load->max = max(load->max,load->load[pos]);
4939                     pos++;
4940                     if (dd->bGridJump)
4941                     {
4942                         if (root->bLimited)
4943                         {
4944                             /* This direction could not be load balanced properly,
4945                              * therefore we need to use the maximum iso the average load.
4946                              */
4947                             load->sum_m = max(load->sum_m,load->load[pos]);
4948                         }
4949                         else
4950                         {
4951                             load->sum_m += load->load[pos];
4952                         }
4953                         pos++;
4954                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4955                         pos++;
4956                         if (d < dd->ndim-1)
4957                         {
4958                             load->flags = (int)(load->load[pos++] + 0.5);
4959                         }
4960                         if (d > 0)
4961                         {
4962                             root->cell_f_max0[i] = load->load[pos++];
4963                             root->cell_f_min1[i] = load->load[pos++];
4964                         }
4965                     }
4966                     if (bSepPME)
4967                     {
4968                         load->mdf = max(load->mdf,load->load[pos]);
4969                         pos++;
4970                         load->pme = max(load->pme,load->load[pos]);
4971                         pos++;
4972                     }
4973                 }
4974                 if (comm->bDynLoadBal && root->bLimited)
4975                 {
4976                     load->sum_m *= dd->nc[dim];
4977                     load->flags |= (1<<d);
4978                 }
4979             }
4980         }
4981     }
4982
4983     if (DDMASTER(dd))
4984     {
4985         comm->nload      += dd_load_count(comm);
4986         comm->load_step  += comm->cycl[ddCyclStep];
4987         comm->load_sum   += comm->load[0].sum;
4988         comm->load_max   += comm->load[0].max;
4989         if (comm->bDynLoadBal)
4990         {
4991             for(d=0; d<dd->ndim; d++)
4992             {
4993                 if (comm->load[0].flags & (1<<d))
4994                 {
4995                     comm->load_lim[d]++;
4996                 }
4997             }
4998         }
4999         if (bSepPME)
5000         {
5001             comm->load_mdf += comm->load[0].mdf;
5002             comm->load_pme += comm->load[0].pme;
5003         }
5004     }
5005
5006     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5007
5008     if (debug)
5009     {
5010         fprintf(debug,"get_load_distribution finished\n");
5011     }
5012 }
5013
5014 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5015 {
5016     /* Return the relative performance loss on the total run time
5017      * due to the force calculation load imbalance.
5018      */
5019     if (dd->comm->nload > 0)
5020     {
5021         return
5022             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5023             (dd->comm->load_step*dd->nnodes);
5024     }
5025     else
5026     {
5027         return 0;
5028     }
5029 }
5030
5031 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5032 {
5033     char  buf[STRLEN];
5034     int   npp,npme,nnodes,d,limp;
5035     float imbal,pme_f_ratio,lossf,lossp=0;
5036     bool  bLim;
5037     gmx_domdec_comm_t *comm;
5038
5039     comm = dd->comm;
5040     if (DDMASTER(dd) && comm->nload > 0)
5041     {
5042         npp    = dd->nnodes;
5043         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5044         nnodes = npp + npme;
5045         imbal = comm->load_max*npp/comm->load_sum - 1;
5046         lossf = dd_force_imb_perf_loss(dd);
5047         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5048         fprintf(fplog,"%s",buf);
5049         fprintf(stderr,"\n");
5050         fprintf(stderr,"%s",buf);
5051         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5052         fprintf(fplog,"%s",buf);
5053         fprintf(stderr,"%s",buf);
5054         bLim = FALSE;
5055         if (comm->bDynLoadBal)
5056         {
5057             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5058             for(d=0; d<dd->ndim; d++)
5059             {
5060                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5061                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5062                 if (limp >= 50)
5063                 {
5064                     bLim = TRUE;
5065                 }
5066             }
5067             sprintf(buf+strlen(buf),"\n");
5068             fprintf(fplog,"%s",buf);
5069             fprintf(stderr,"%s",buf);
5070         }
5071         if (npme > 0)
5072         {
5073             pme_f_ratio = comm->load_pme/comm->load_mdf;
5074             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5075             if (lossp <= 0)
5076             {
5077                 lossp *= (float)npme/(float)nnodes;
5078             }
5079             else
5080             {
5081                 lossp *= (float)npp/(float)nnodes;
5082             }
5083             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5084             fprintf(fplog,"%s",buf);
5085             fprintf(stderr,"%s",buf);
5086             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5087             fprintf(fplog,"%s",buf);
5088             fprintf(stderr,"%s",buf);
5089         }
5090         fprintf(fplog,"\n");
5091         fprintf(stderr,"\n");
5092
5093         if (lossf >= DD_PERF_LOSS)
5094         {
5095             sprintf(buf,
5096                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5097                     "      in the domain decomposition.\n",lossf*100);
5098             if (!comm->bDynLoadBal)
5099             {
5100                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5101             }
5102             else if (bLim)
5103             {
5104                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5105             }
5106             fprintf(fplog,"%s\n",buf);
5107             fprintf(stderr,"%s\n",buf);
5108         }
5109         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5110         {
5111             sprintf(buf,
5112                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5113                     "      had %s work to do than the PP nodes.\n"
5114                     "      You might want to %s the number of PME nodes\n"
5115                     "      or %s the cut-off and the grid spacing.\n",
5116                     fabs(lossp*100),
5117                     (lossp < 0) ? "less"     : "more",
5118                     (lossp < 0) ? "decrease" : "increase",
5119                     (lossp < 0) ? "decrease" : "increase");
5120             fprintf(fplog,"%s\n",buf);
5121             fprintf(stderr,"%s\n",buf);
5122         }
5123     }
5124 }
5125
5126 static float dd_vol_min(gmx_domdec_t *dd)
5127 {
5128     return dd->comm->load[0].cvol_min*dd->nnodes;
5129 }
5130
5131 static bool dd_load_flags(gmx_domdec_t *dd)
5132 {
5133     return dd->comm->load[0].flags;
5134 }
5135
5136 static float dd_f_imbal(gmx_domdec_t *dd)
5137 {
5138     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5139 }
5140
5141 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5142 {
5143     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5144 }
5145
5146 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5147 {
5148     int flags,d;
5149     char buf[22];
5150
5151     flags = dd_load_flags(dd);
5152     if (flags)
5153     {
5154         fprintf(fplog,
5155                 "DD  load balancing is limited by minimum cell size in dimension");
5156         for(d=0; d<dd->ndim; d++)
5157         {
5158             if (flags & (1<<d))
5159             {
5160                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5161             }
5162         }
5163         fprintf(fplog,"\n");
5164     }
5165     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5166     if (dd->comm->bDynLoadBal)
5167     {
5168         fprintf(fplog,"  vol min/aver %5.3f%c",
5169                 dd_vol_min(dd),flags ? '!' : ' ');
5170     }
5171     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5172     if (dd->comm->cycl_n[ddCyclPME])
5173     {
5174         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5175     }
5176     fprintf(fplog,"\n\n");
5177 }
5178
5179 static void dd_print_load_verbose(gmx_domdec_t *dd)
5180 {
5181     if (dd->comm->bDynLoadBal)
5182     {
5183         fprintf(stderr,"vol %4.2f%c ",
5184                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5185     }
5186     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5187     if (dd->comm->cycl_n[ddCyclPME])
5188     {
5189         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5190     }
5191 }
5192
5193 #ifdef GMX_MPI
5194 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5195                                    int dim_ind,ivec loc)
5196 {
5197     MPI_Group g_row;
5198     MPI_Comm  c_row;
5199     int  dim,i,*rank;
5200     ivec loc_c;
5201     gmx_domdec_root_t *root;
5202
5203     dim = dd->dim[dim_ind];
5204     copy_ivec(loc,loc_c);
5205     snew(rank,dd->nc[dim]);
5206     for(i=0; i<dd->nc[dim]; i++)
5207     {
5208         loc_c[dim] = i;
5209         rank[i] = dd_index(dd->nc,loc_c);
5210     }
5211     /* Here we create a new group, that does not necessarily
5212      * include our process. But MPI_Comm_create needs to be
5213      * called by all the processes in the original communicator.
5214      * Calling MPI_Group_free afterwards gives errors, so I assume
5215      * also the group is needed by all processes. (B. Hess)
5216      */
5217     MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5218     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5219     if (c_row != MPI_COMM_NULL)
5220     {
5221         /* This process is part of the group */
5222         dd->comm->mpi_comm_load[dim_ind] = c_row;
5223         if (dd->comm->eDLB != edlbNO)
5224         {
5225             if (dd->ci[dim] == dd->master_ci[dim])
5226             {
5227                 /* This is the root process of this row */
5228                 snew(dd->comm->root[dim_ind],1);
5229                 root = dd->comm->root[dim_ind];
5230                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5231                 snew(root->old_cell_f,dd->nc[dim]+1);
5232                 snew(root->bCellMin,dd->nc[dim]);
5233                 if (dim_ind > 0)
5234                 {
5235                     snew(root->cell_f_max0,dd->nc[dim]);
5236                     snew(root->cell_f_min1,dd->nc[dim]);
5237                     snew(root->bound_min,dd->nc[dim]);
5238                     snew(root->bound_max,dd->nc[dim]);
5239                 }
5240                 snew(root->buf_ncd,dd->nc[dim]);
5241             }
5242             else
5243             {
5244                 /* This is not a root process, we only need to receive cell_f */
5245                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5246             }
5247         }
5248         if (dd->ci[dim] == dd->master_ci[dim])
5249         {
5250             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5251         }
5252     }
5253     sfree(rank);
5254 }
5255 #endif
5256
5257 static void make_load_communicators(gmx_domdec_t *dd)
5258 {
5259 #ifdef GMX_MPI
5260   MPI_Group g_all;
5261   int  dim0,dim1,i,j;
5262   ivec loc;
5263
5264   if (debug)
5265     fprintf(debug,"Making load communicators\n");
5266
5267   MPI_Comm_group(dd->mpi_comm_all,&g_all);
5268
5269   snew(dd->comm->load,dd->ndim);
5270   snew(dd->comm->mpi_comm_load,dd->ndim);
5271
5272   clear_ivec(loc);
5273   make_load_communicator(dd,g_all,0,loc);
5274   if (dd->ndim > 1) {
5275     dim0 = dd->dim[0];
5276     for(i=0; i<dd->nc[dim0]; i++) {
5277       loc[dim0] = i;
5278       make_load_communicator(dd,g_all,1,loc);
5279     }
5280   }
5281   if (dd->ndim > 2) {
5282     dim0 = dd->dim[0];
5283     for(i=0; i<dd->nc[dim0]; i++) {
5284       loc[dim0] = i;
5285       dim1 = dd->dim[1];
5286       for(j=0; j<dd->nc[dim1]; j++) {
5287           loc[dim1] = j;
5288           make_load_communicator(dd,g_all,2,loc);
5289       }
5290     }
5291   }
5292
5293   MPI_Group_free(&g_all);
5294
5295   if (debug)
5296     fprintf(debug,"Finished making load communicators\n");
5297 #endif
5298 }
5299
5300 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5301 {
5302     bool bZYX;
5303     int  d,dim,i,j,m;
5304     ivec tmp,s;
5305     int  nzone,nzonep;
5306     ivec dd_zp[DD_MAXIZONE];
5307     gmx_domdec_zones_t *zones;
5308     gmx_domdec_ns_ranges_t *izone;
5309
5310     for(d=0; d<dd->ndim; d++)
5311     {
5312         dim = dd->dim[d];
5313         copy_ivec(dd->ci,tmp);
5314         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5315         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5316         copy_ivec(dd->ci,tmp);
5317         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5318         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5319         if (debug)
5320         {
5321             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5322                     dd->rank,dim,
5323                     dd->neighbor[d][0],
5324                     dd->neighbor[d][1]);
5325         }
5326     }
5327
5328     if (DDMASTER(dd))
5329     {
5330         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5331             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5332     }
5333     if (fplog)
5334     {
5335         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5336                 dd->ndim,
5337                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5338                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5339     }
5340     switch (dd->ndim)
5341     {
5342     case 3:
5343         nzone  = dd_z3n;
5344         nzonep = dd_zp3n;
5345         for(i=0; i<nzonep; i++)
5346         {
5347             copy_ivec(dd_zp3[i],dd_zp[i]);
5348         }
5349         break;
5350     case 2:
5351         nzone  = dd_z2n;
5352         nzonep = dd_zp2n;
5353         for(i=0; i<nzonep; i++)
5354         {
5355             copy_ivec(dd_zp2[i],dd_zp[i]);
5356         }
5357         break;
5358     case 1:
5359         nzone  = dd_z1n;
5360         nzonep = dd_zp1n;
5361         for(i=0; i<nzonep; i++)
5362         {
5363             copy_ivec(dd_zp1[i],dd_zp[i]);
5364         }
5365         break;
5366     default:
5367         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5368         nzone = 0;
5369         nzonep = 0;
5370     }
5371
5372     zones = &dd->comm->zones;
5373
5374     for(i=0; i<nzone; i++)
5375     {
5376         m = 0;
5377         clear_ivec(zones->shift[i]);
5378         for(d=0; d<dd->ndim; d++)
5379         {
5380             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5381         }
5382     }
5383
5384     zones->n = nzone;
5385     for(i=0; i<nzone; i++)
5386     {
5387         for(d=0; d<DIM; d++)
5388         {
5389             s[d] = dd->ci[d] - zones->shift[i][d];
5390             if (s[d] < 0)
5391             {
5392                 s[d] += dd->nc[d];
5393             }
5394             else if (s[d] >= dd->nc[d])
5395             {
5396                 s[d] -= dd->nc[d];
5397             }
5398         }
5399     }
5400     zones->nizone = nzonep;
5401     for(i=0; i<zones->nizone; i++)
5402     {
5403         if (dd_zp[i][0] != i)
5404         {
5405             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5406         }
5407         izone = &zones->izone[i];
5408         izone->j0 = dd_zp[i][1];
5409         izone->j1 = dd_zp[i][2];
5410         for(dim=0; dim<DIM; dim++)
5411         {
5412             if (dd->nc[dim] == 1)
5413             {
5414                 /* All shifts should be allowed */
5415                 izone->shift0[dim] = -1;
5416                 izone->shift1[dim] = 1;
5417             }
5418             else
5419             {
5420                 /*
5421                   izone->shift0[d] = 0;
5422                   izone->shift1[d] = 0;
5423                   for(j=izone->j0; j<izone->j1; j++) {
5424                   if (dd->shift[j][d] > dd->shift[i][d])
5425                   izone->shift0[d] = -1;
5426                   if (dd->shift[j][d] < dd->shift[i][d])
5427                   izone->shift1[d] = 1;
5428                   }
5429                 */
5430
5431                 int shift_diff;
5432
5433                 /* Assume the shift are not more than 1 cell */
5434                 izone->shift0[dim] = 1;
5435                 izone->shift1[dim] = -1;
5436                 for(j=izone->j0; j<izone->j1; j++)
5437                 {
5438                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5439                     if (shift_diff < izone->shift0[dim])
5440                     {
5441                         izone->shift0[dim] = shift_diff;
5442                     }
5443                     if (shift_diff > izone->shift1[dim])
5444                     {
5445                         izone->shift1[dim] = shift_diff;
5446                     }
5447                 }
5448             }
5449         }
5450     }
5451
5452     if (dd->comm->eDLB != edlbNO)
5453     {
5454         snew(dd->comm->root,dd->ndim);
5455     }
5456
5457     if (dd->comm->bRecordLoad)
5458     {
5459         make_load_communicators(dd);
5460     }
5461 }
5462
5463 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5464 {
5465     gmx_domdec_t *dd;
5466     gmx_domdec_comm_t *comm;
5467     int  i,rank,*buf;
5468     ivec periods;
5469 #ifdef GMX_MPI
5470     MPI_Comm comm_cart;
5471 #endif
5472
5473     dd = cr->dd;
5474     comm = dd->comm;
5475
5476 #ifdef GMX_MPI
5477     if (comm->bCartesianPP)
5478     {
5479         /* Set up cartesian communication for the particle-particle part */
5480         if (fplog)
5481         {
5482             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5483                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5484         }
5485
5486         for(i=0; i<DIM; i++)
5487         {
5488             periods[i] = TRUE;
5489         }
5490         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5491                         &comm_cart);
5492         /* We overwrite the old communicator with the new cartesian one */
5493         cr->mpi_comm_mygroup = comm_cart;
5494     }
5495
5496     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5497     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5498
5499     if (comm->bCartesianPP_PME)
5500     {
5501         /* Since we want to use the original cartesian setup for sim,
5502          * and not the one after split, we need to make an index.
5503          */
5504         snew(comm->ddindex2ddnodeid,dd->nnodes);
5505         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5506         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5507         /* Get the rank of the DD master,
5508          * above we made sure that the master node is a PP node.
5509          */
5510         if (MASTER(cr))
5511         {
5512             rank = dd->rank;
5513         }
5514         else
5515         {
5516             rank = 0;
5517         }
5518         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5519     }
5520     else if (comm->bCartesianPP)
5521     {
5522         if (cr->npmenodes == 0)
5523         {
5524             /* The PP communicator is also
5525              * the communicator for this simulation
5526              */
5527             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5528         }
5529         cr->nodeid = dd->rank;
5530
5531         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5532
5533         /* We need to make an index to go from the coordinates
5534          * to the nodeid of this simulation.
5535          */
5536         snew(comm->ddindex2simnodeid,dd->nnodes);
5537         snew(buf,dd->nnodes);
5538         if (cr->duty & DUTY_PP)
5539         {
5540             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5541         }
5542         /* Communicate the ddindex to simulation nodeid index */
5543         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5544                       cr->mpi_comm_mysim);
5545         sfree(buf);
5546
5547         /* Determine the master coordinates and rank.
5548          * The DD master should be the same node as the master of this sim.
5549          */
5550         for(i=0; i<dd->nnodes; i++)
5551         {
5552             if (comm->ddindex2simnodeid[i] == 0)
5553             {
5554                 ddindex2xyz(dd->nc,i,dd->master_ci);
5555                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5556             }
5557         }
5558         if (debug)
5559         {
5560             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5561         }
5562     }
5563     else
5564     {
5565         /* No Cartesian communicators */
5566         /* We use the rank in dd->comm->all as DD index */
5567         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5568         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5569         dd->masterrank = 0;
5570         clear_ivec(dd->master_ci);
5571     }
5572 #endif
5573
5574     if (fplog)
5575     {
5576         fprintf(fplog,
5577                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5578                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5579     }
5580     if (debug)
5581     {
5582         fprintf(debug,
5583                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5584                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5585     }
5586 }
5587
5588 static void receive_ddindex2simnodeid(t_commrec *cr)
5589 {
5590     gmx_domdec_t *dd;
5591
5592     gmx_domdec_comm_t *comm;
5593     int  *buf;
5594
5595     dd = cr->dd;
5596     comm = dd->comm;
5597
5598 #ifdef GMX_MPI
5599     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5600     {
5601         snew(comm->ddindex2simnodeid,dd->nnodes);
5602         snew(buf,dd->nnodes);
5603         if (cr->duty & DUTY_PP)
5604         {
5605             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5606         }
5607 #ifdef GMX_MPI
5608         /* Communicate the ddindex to simulation nodeid index */
5609         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5610                       cr->mpi_comm_mysim);
5611 #endif
5612         sfree(buf);
5613     }
5614 #endif
5615 }
5616
5617 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5618                                                      int ncg,int natoms)
5619 {
5620     gmx_domdec_master_t *ma;
5621     int i;
5622
5623     snew(ma,1);
5624
5625     snew(ma->ncg,dd->nnodes);
5626     snew(ma->index,dd->nnodes+1);
5627     snew(ma->cg,ncg);
5628     snew(ma->nat,dd->nnodes);
5629     snew(ma->ibuf,dd->nnodes*2);
5630     snew(ma->cell_x,DIM);
5631     for(i=0; i<DIM; i++)
5632     {
5633         snew(ma->cell_x[i],dd->nc[i]+1);
5634     }
5635
5636     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5637     {
5638         ma->vbuf = NULL;
5639     }
5640     else
5641     {
5642         snew(ma->vbuf,natoms);
5643     }
5644
5645     return ma;
5646 }
5647
5648 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5649                                int reorder)
5650 {
5651     gmx_domdec_t *dd;
5652     gmx_domdec_comm_t *comm;
5653     int  i,rank;
5654     bool bDiv[DIM];
5655     ivec periods;
5656 #ifdef GMX_MPI
5657     MPI_Comm comm_cart;
5658 #endif
5659
5660     dd = cr->dd;
5661     comm = dd->comm;
5662
5663     if (comm->bCartesianPP)
5664     {
5665         for(i=1; i<DIM; i++)
5666         {
5667             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5668         }
5669         if (bDiv[YY] || bDiv[ZZ])
5670         {
5671             comm->bCartesianPP_PME = TRUE;
5672             /* If we have 2D PME decomposition, which is always in x+y,
5673              * we stack the PME only nodes in z.
5674              * Otherwise we choose the direction that provides the thinnest slab
5675              * of PME only nodes as this will have the least effect
5676              * on the PP communication.
5677              * But for the PME communication the opposite might be better.
5678              */
5679             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5680                              !bDiv[YY] ||
5681                              dd->nc[YY] > dd->nc[ZZ]))
5682             {
5683                 comm->cartpmedim = ZZ;
5684             }
5685             else
5686             {
5687                 comm->cartpmedim = YY;
5688             }
5689             comm->ntot[comm->cartpmedim]
5690                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5691         }
5692         else if (fplog)
5693         {
5694             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5695             fprintf(fplog,
5696                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5697         }
5698     }
5699
5700 #ifdef GMX_MPI
5701     if (comm->bCartesianPP_PME)
5702     {
5703         if (fplog)
5704         {
5705             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5706         }
5707
5708         for(i=0; i<DIM; i++)
5709         {
5710             periods[i] = TRUE;
5711         }
5712         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5713                         &comm_cart);
5714
5715         MPI_Comm_rank(comm_cart,&rank);
5716         if (MASTERNODE(cr) && rank != 0)
5717         {
5718             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5719         }
5720
5721         /* With this assigment we loose the link to the original communicator
5722          * which will usually be MPI_COMM_WORLD, unless have multisim.
5723          */
5724         cr->mpi_comm_mysim = comm_cart;
5725         cr->sim_nodeid = rank;
5726
5727         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5728
5729         if (fplog)
5730         {
5731             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5732                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5733         }
5734
5735         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5736         {
5737             cr->duty = DUTY_PP;
5738         }
5739         if (cr->npmenodes == 0 ||
5740             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5741         {
5742             cr->duty = DUTY_PME;
5743         }
5744
5745         /* Split the sim communicator into PP and PME only nodes */
5746         MPI_Comm_split(cr->mpi_comm_mysim,
5747                        cr->duty,
5748                        dd_index(comm->ntot,dd->ci),
5749                        &cr->mpi_comm_mygroup);
5750     }
5751     else
5752     {
5753         switch (dd_node_order)
5754         {
5755         case ddnoPP_PME:
5756             if (fplog)
5757             {
5758                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5759             }
5760             break;
5761         case ddnoINTERLEAVE:
5762             /* Interleave the PP-only and PME-only nodes,
5763              * as on clusters with dual-core machines this will double
5764              * the communication bandwidth of the PME processes
5765              * and thus speed up the PP <-> PME and inter PME communication.
5766              */
5767             if (fplog)
5768             {
5769                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5770             }
5771             comm->pmenodes = dd_pmenodes(cr);
5772             break;
5773         case ddnoCARTESIAN:
5774             break;
5775         default:
5776             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5777         }
5778
5779         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5780         {
5781             cr->duty = DUTY_PME;
5782         }
5783         else
5784         {
5785             cr->duty = DUTY_PP;
5786         }
5787
5788         /* Split the sim communicator into PP and PME only nodes */
5789         MPI_Comm_split(cr->mpi_comm_mysim,
5790                        cr->duty,
5791                        cr->nodeid,
5792                        &cr->mpi_comm_mygroup);
5793         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5794     }
5795 #endif
5796
5797     if (fplog)
5798     {
5799         fprintf(fplog,"This is a %s only node\n\n",
5800                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5801     }
5802 }
5803
5804 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5805 {
5806     gmx_domdec_t *dd;
5807     gmx_domdec_comm_t *comm;
5808     int CartReorder;
5809
5810     dd = cr->dd;
5811     comm = dd->comm;
5812
5813     copy_ivec(dd->nc,comm->ntot);
5814
5815     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5816     comm->bCartesianPP_PME = FALSE;
5817
5818     /* Reorder the nodes by default. This might change the MPI ranks.
5819      * Real reordering is only supported on very few architectures,
5820      * Blue Gene is one of them.
5821      */
5822     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5823
5824     if (cr->npmenodes > 0)
5825     {
5826         /* Split the communicator into a PP and PME part */
5827         split_communicator(fplog,cr,dd_node_order,CartReorder);
5828         if (comm->bCartesianPP_PME)
5829         {
5830             /* We (possibly) reordered the nodes in split_communicator,
5831              * so it is no longer required in make_pp_communicator.
5832              */
5833             CartReorder = FALSE;
5834         }
5835     }
5836     else
5837     {
5838         /* All nodes do PP and PME */
5839 #ifdef GMX_MPI
5840         /* We do not require separate communicators */
5841         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5842 #endif
5843     }
5844
5845     if (cr->duty & DUTY_PP)
5846     {
5847         /* Copy or make a new PP communicator */
5848         make_pp_communicator(fplog,cr,CartReorder);
5849     }
5850     else
5851     {
5852         receive_ddindex2simnodeid(cr);
5853     }
5854
5855     if (!(cr->duty & DUTY_PME))
5856     {
5857         /* Set up the commnuication to our PME node */
5858         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5859         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5860         if (debug)
5861         {
5862             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5863                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5864         }
5865     }
5866     else
5867     {
5868         dd->pme_nodeid = -1;
5869     }
5870
5871     if (DDMASTER(dd))
5872     {
5873         dd->ma = init_gmx_domdec_master_t(dd,
5874                                           comm->cgs_gl.nr,
5875                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5876     }
5877 }
5878
5879 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5880 {
5881     real *slb_frac,tot;
5882     int  i,n;
5883     double dbl;
5884
5885     slb_frac = NULL;
5886     if (nc > 1 && size_string != NULL)
5887     {
5888         if (fplog)
5889         {
5890             fprintf(fplog,"Using static load balancing for the %s direction\n",
5891                     dir);
5892         }
5893         snew(slb_frac,nc);
5894         tot = 0;
5895         for (i=0; i<nc; i++)
5896         {
5897             dbl = 0;
5898             sscanf(size_string,"%lf%n",&dbl,&n);
5899             if (dbl == 0)
5900             {
5901                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5902             }
5903             slb_frac[i] = dbl;
5904             size_string += n;
5905             tot += slb_frac[i];
5906         }
5907         /* Normalize */
5908         if (fplog)
5909         {
5910             fprintf(fplog,"Relative cell sizes:");
5911         }
5912         for (i=0; i<nc; i++)
5913         {
5914             slb_frac[i] /= tot;
5915             if (fplog)
5916             {
5917                 fprintf(fplog," %5.3f",slb_frac[i]);
5918             }
5919         }
5920         if (fplog)
5921         {
5922             fprintf(fplog,"\n");
5923         }
5924     }
5925
5926     return slb_frac;
5927 }
5928
5929 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5930 {
5931     int n,nmol,ftype;
5932     gmx_mtop_ilistloop_t iloop;
5933     t_ilist *il;
5934
5935     n = 0;
5936     iloop = gmx_mtop_ilistloop_init(mtop);
5937     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5938     {
5939         for(ftype=0; ftype<F_NRE; ftype++)
5940         {
5941             if ((interaction_function[ftype].flags & IF_BOND) &&
5942                 NRAL(ftype) >  2)
5943             {
5944                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5945             }
5946         }
5947   }
5948
5949   return n;
5950 }
5951
5952 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5953 {
5954     char *val;
5955     int  nst;
5956
5957     nst = def;
5958     val = getenv(env_var);
5959     if (val)
5960     {
5961         if (sscanf(val,"%d",&nst) <= 0)
5962         {
5963             nst = 1;
5964         }
5965         if (fplog)
5966         {
5967             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5968                     env_var,val,nst);
5969         }
5970     }
5971
5972     return nst;
5973 }
5974
5975 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5976 {
5977     if (MASTER(cr))
5978     {
5979         fprintf(stderr,"\n%s\n",warn_string);
5980     }
5981     if (fplog)
5982     {
5983         fprintf(fplog,"\n%s\n",warn_string);
5984     }
5985 }
5986
5987 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5988                                   t_inputrec *ir,FILE *fplog)
5989 {
5990     if (ir->ePBC == epbcSCREW &&
5991         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5992     {
5993         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
5994     }
5995
5996     if (ir->ns_type == ensSIMPLE)
5997     {
5998         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
5999     }
6000
6001     if (ir->nstlist == 0)
6002     {
6003         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6004     }
6005
6006     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6007     {
6008         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6009     }
6010 }
6011
6012 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6013 {
6014     int  di,d;
6015     real r;
6016
6017     r = ddbox->box_size[XX];
6018     for(di=0; di<dd->ndim; di++)
6019     {
6020         d = dd->dim[di];
6021         /* Check using the initial average cell size */
6022         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6023     }
6024
6025     return r;
6026 }
6027
6028 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6029                              const char *dlb_opt,bool bRecordLoad,
6030                              unsigned long Flags,t_inputrec *ir)
6031 {
6032     gmx_domdec_t *dd;
6033     int  eDLB=-1;
6034     char buf[STRLEN];
6035
6036     switch (dlb_opt[0])
6037     {
6038     case 'a': eDLB = edlbAUTO; break;
6039     case 'n': eDLB = edlbNO;   break;
6040     case 'y': eDLB = edlbYES;  break;
6041     default: gmx_incons("Unknown dlb_opt");
6042     }
6043
6044     if (Flags & MD_RERUN)
6045     {
6046         return edlbNO;
6047     }
6048
6049     if (!EI_DYNAMICS(ir->eI))
6050     {
6051         if (eDLB == edlbYES)
6052         {
6053             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6054             dd_warning(cr,fplog,buf);
6055         }
6056
6057         return edlbNO;
6058     }
6059
6060     if (!bRecordLoad)
6061     {
6062         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6063
6064         return edlbNO;
6065     }
6066
6067     if (Flags & MD_REPRODUCIBLE)
6068     {
6069         switch (eDLB)
6070         {
6071                         case edlbNO:
6072                                 break;
6073                         case edlbAUTO:
6074                                 dd_warning(cr,fplog,"NOTE: reproducability requested, will not use dynamic load balancing\n");
6075                                 eDLB = edlbNO;
6076                                 break;
6077                         case edlbYES:
6078                                 dd_warning(cr,fplog,"WARNING: reproducability requested with dynamic load balancing, the simulation will NOT be binary reproducable\n");
6079                                 break;
6080                         default:
6081                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6082                                 break;
6083         }
6084     }
6085
6086     return eDLB;
6087 }
6088
6089 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6090 {
6091     int dim;
6092
6093     dd->ndim = 0;
6094     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6095     {
6096         /* Decomposition order z,y,x */
6097         if (fplog)
6098         {
6099             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6100         }
6101         for(dim=DIM-1; dim>=0; dim--)
6102         {
6103             if (dd->nc[dim] > 1)
6104             {
6105                 dd->dim[dd->ndim++] = dim;
6106             }
6107         }
6108     }
6109     else
6110     {
6111         /* Decomposition order x,y,z */
6112         for(dim=0; dim<DIM; dim++)
6113         {
6114             if (dd->nc[dim] > 1)
6115             {
6116                 dd->dim[dd->ndim++] = dim;
6117             }
6118         }
6119     }
6120 }
6121
6122 static gmx_domdec_comm_t *init_dd_comm()
6123 {
6124     gmx_domdec_comm_t *comm;
6125     int  i;
6126
6127     snew(comm,1);
6128     snew(comm->cggl_flag,DIM*2);
6129     snew(comm->cgcm_state,DIM*2);
6130     for(i=0; i<DIM*2; i++)
6131     {
6132         comm->cggl_flag_nalloc[i]  = 0;
6133         comm->cgcm_state_nalloc[i] = 0;
6134     }
6135
6136     comm->nalloc_int = 0;
6137     comm->buf_int    = NULL;
6138
6139     vec_rvec_init(&comm->vbuf);
6140
6141     comm->n_load_have    = 0;
6142     comm->n_load_collect = 0;
6143
6144     for(i=0; i<ddnatNR-ddnatZONE; i++)
6145     {
6146         comm->sum_nat[i] = 0;
6147     }
6148     comm->ndecomp = 0;
6149     comm->nload   = 0;
6150     comm->load_step = 0;
6151     comm->load_sum  = 0;
6152     comm->load_max  = 0;
6153     clear_ivec(comm->load_lim);
6154     comm->load_mdf  = 0;
6155     comm->load_pme  = 0;
6156
6157     return comm;
6158 }
6159
6160 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6161                                         unsigned long Flags,
6162                                         ivec nc,
6163                                         real comm_distance_min,real rconstr,
6164                                         const char *dlb_opt,real dlb_scale,
6165                                         const char *sizex,const char *sizey,const char *sizez,
6166                                         gmx_mtop_t *mtop,t_inputrec *ir,
6167                                         matrix box,rvec *x,
6168                                         gmx_ddbox_t *ddbox,
6169                                         int *npme_x,int *npme_y)
6170 {
6171     gmx_domdec_t *dd;
6172     gmx_domdec_comm_t *comm;
6173     int  recload;
6174     int  d,i,j;
6175     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6176     bool bC;
6177     char buf[STRLEN];
6178
6179     if (fplog)
6180     {
6181         fprintf(fplog,
6182                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6183     }
6184
6185     snew(dd,1);
6186
6187     dd->comm = init_dd_comm();
6188     comm = dd->comm;
6189     snew(comm->cggl_flag,DIM*2);
6190     snew(comm->cgcm_state,DIM*2);
6191
6192     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6193     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6194
6195     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6196     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6197     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6198     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6199     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6200     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6201     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6202
6203     dd->pme_recv_f_alloc = 0;
6204     dd->pme_recv_f_buf = NULL;
6205
6206     if (dd->bSendRecv2 && fplog)
6207     {
6208         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6209     }
6210     if (comm->eFlop)
6211     {
6212         if (fplog)
6213         {
6214             fprintf(fplog,"Will load balance based on FLOP count\n");
6215         }
6216         if (comm->eFlop > 1)
6217         {
6218             srand(1+cr->nodeid);
6219         }
6220         comm->bRecordLoad = TRUE;
6221     }
6222     else
6223     {
6224         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6225
6226     }
6227
6228     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6229
6230     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6231     if (fplog)
6232     {
6233         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6234     }
6235     dd->bGridJump = comm->bDynLoadBal;
6236
6237     if (comm->nstSortCG)
6238     {
6239         if (fplog)
6240         {
6241             if (comm->nstSortCG == 1)
6242             {
6243                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6244             }
6245             else
6246             {
6247                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6248                         comm->nstSortCG);
6249             }
6250         }
6251         snew(comm->sort,1);
6252     }
6253     else
6254     {
6255         if (fplog)
6256         {
6257             fprintf(fplog,"Will not sort the charge groups\n");
6258         }
6259     }
6260
6261     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6262     if (comm->bInterCGBondeds)
6263     {
6264         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6265     }
6266     else
6267     {
6268         comm->bInterCGMultiBody = FALSE;
6269     }
6270
6271     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6272
6273     if (ir->rlistlong == 0)
6274     {
6275         /* Set the cut-off to some very large value,
6276          * so we don't need if statements everywhere in the code.
6277          * We use sqrt, since the cut-off is squared in some places.
6278          */
6279         comm->cutoff   = GMX_CUTOFF_INF;
6280     }
6281     else
6282     {
6283         comm->cutoff   = ir->rlistlong;
6284     }
6285     comm->cutoff_mbody = 0;
6286
6287     comm->cellsize_limit = 0;
6288     comm->bBondComm = FALSE;
6289
6290     if (comm->bInterCGBondeds)
6291     {
6292         if (comm_distance_min > 0)
6293         {
6294             comm->cutoff_mbody = comm_distance_min;
6295             if (Flags & MD_DDBONDCOMM)
6296             {
6297                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6298             }
6299             else
6300             {
6301                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6302             }
6303             r_bonded_limit = comm->cutoff_mbody;
6304         }
6305         else if (ir->bPeriodicMols)
6306         {
6307             /* Can not easily determine the required cut-off */
6308             dd_warning(cr,fplog,"NOTE: Periodic molecules: can not easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
6309             comm->cutoff_mbody = comm->cutoff/2;
6310             r_bonded_limit = comm->cutoff_mbody;
6311         }
6312         else
6313         {
6314             if (MASTER(cr))
6315             {
6316                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6317                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6318             }
6319             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6320             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6321
6322             /* We use an initial margin of 10% for the minimum cell size,
6323              * except when we are just below the non-bonded cut-off.
6324              */
6325             if (Flags & MD_DDBONDCOMM)
6326             {
6327                 if (max(r_2b,r_mb) > comm->cutoff)
6328                 {
6329                     r_bonded       = max(r_2b,r_mb);
6330                     r_bonded_limit = 1.1*r_bonded;
6331                     comm->bBondComm = TRUE;
6332                 }
6333                 else
6334                 {
6335                     r_bonded       = r_mb;
6336                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6337                 }
6338                 /* We determine cutoff_mbody later */
6339             }
6340             else
6341             {
6342                 /* No special bonded communication,
6343                  * simply increase the DD cut-off.
6344                  */
6345                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6346                 comm->cutoff_mbody = r_bonded_limit;
6347                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6348             }
6349         }
6350         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6351         if (fplog)
6352         {
6353             fprintf(fplog,
6354                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6355                     comm->cellsize_limit);
6356         }
6357     }
6358
6359     if (dd->bInterCGcons && rconstr <= 0)
6360     {
6361         /* There is a cell size limit due to the constraints (P-LINCS) */
6362         rconstr = constr_r_max(fplog,mtop,ir);
6363         if (fplog)
6364         {
6365             fprintf(fplog,
6366                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6367                     rconstr);
6368             if (rconstr > comm->cellsize_limit)
6369             {
6370                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6371             }
6372         }
6373     }
6374     else if (rconstr > 0 && fplog)
6375     {
6376         /* Here we do not check for dd->bInterCGcons,
6377          * because one can also set a cell size limit for virtual sites only
6378          * and at this point we don't know yet if there are intercg v-sites.
6379          */
6380         fprintf(fplog,
6381                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6382                 rconstr);
6383     }
6384     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6385
6386     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6387
6388     if (nc[XX] > 0)
6389     {
6390         copy_ivec(nc,dd->nc);
6391         set_dd_dim(fplog,dd);
6392         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6393
6394         if (cr->npmenodes == -1)
6395         {
6396             cr->npmenodes = 0;
6397         }
6398         acs = average_cellsize_min(dd,ddbox);
6399         if (acs < comm->cellsize_limit)
6400         {
6401             if (fplog)
6402             {
6403                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6404             }
6405             gmx_fatal_collective(FARGS,cr,NULL,
6406                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6407                                  acs,comm->cellsize_limit);
6408         }
6409     }
6410     else
6411     {
6412         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6413
6414         /* We need to choose the optimal DD grid and possibly PME nodes */
6415         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6416                                comm->eDLB!=edlbNO,dlb_scale,
6417                                comm->cellsize_limit,comm->cutoff,
6418                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6419
6420         if (dd->nc[XX] == 0)
6421         {
6422             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6423             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6424                     !bC ? "-rdd" : "-rcon",
6425                     comm->eDLB!=edlbNO ? " or -dds" : "",
6426                     bC ? " or your LINCS settings" : "");
6427
6428             gmx_fatal_collective(FARGS,cr,NULL,
6429                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6430                                  "%s\n"
6431                                  "Look in the log file for details on the domain decomposition",
6432                                  cr->nnodes-cr->npmenodes,limit,buf);
6433         }
6434         set_dd_dim(fplog,dd);
6435     }
6436
6437     if (fplog)
6438     {
6439         fprintf(fplog,
6440                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6441                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6442     }
6443
6444     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6445     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6446     {
6447         gmx_fatal_collective(FARGS,cr,NULL,
6448                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6449                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6450     }
6451     if (cr->npmenodes > dd->nnodes)
6452     {
6453         gmx_fatal_collective(FARGS,cr,NULL,
6454                              "The number of separate PME node (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6455     }
6456     if (cr->npmenodes > 0)
6457     {
6458         comm->npmenodes = cr->npmenodes;
6459     }
6460     else
6461     {
6462         comm->npmenodes = dd->nnodes;
6463     }
6464
6465     if (EEL_PME(ir->coulombtype))
6466     {
6467         /* The following choices should match those
6468          * in comm_cost_est in domdec_setup.c.
6469          * Note that here the checks have to take into account
6470          * that the decomposition might occur in a different order than xyz
6471          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6472          * in which case they will not match those in comm_cost_est,
6473          * but since that is mainly for testing purposes that's fine.
6474          */
6475         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6476             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6477             getenv("GMX_PMEONEDD") == NULL)
6478         {
6479             comm->npmedecompdim = 2;
6480             comm->npmenodes_x   = dd->nc[XX];
6481             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6482         }
6483         else
6484         {
6485             /* In case nc is 1 in both x and y we could still choose to
6486              * decompose pme in y instead of x, but we use x for simplicity.
6487              */
6488             comm->npmedecompdim = 1;
6489             if (dd->dim[0] == YY)
6490             {
6491                 comm->npmenodes_x = 1;
6492                 comm->npmenodes_y = comm->npmenodes;
6493             }
6494             else
6495             {
6496                 comm->npmenodes_x = comm->npmenodes;
6497                 comm->npmenodes_y = 1;
6498             }
6499         }
6500         if (fplog)
6501         {
6502             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6503                     comm->npmenodes_x,comm->npmenodes_y,1);
6504         }
6505     }
6506     else
6507     {
6508         comm->npmedecompdim = 0;
6509         comm->npmenodes_x   = 0;
6510         comm->npmenodes_y   = 0;
6511     }
6512
6513     /* Technically we don't need both of these,
6514      * but it simplifies code not having to recalculate it.
6515      */
6516     *npme_x = comm->npmenodes_x;
6517     *npme_y = comm->npmenodes_y;
6518
6519     snew(comm->slb_frac,DIM);
6520     if (comm->eDLB == edlbNO)
6521     {
6522         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6523         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6524         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6525     }
6526
6527     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6528     {
6529         if (comm->bBondComm || comm->eDLB != edlbNO)
6530         {
6531             /* Set the bonded communication distance to halfway
6532              * the minimum and the maximum,
6533              * since the extra communication cost is nearly zero.
6534              */
6535             acs = average_cellsize_min(dd,ddbox);
6536             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6537             if (comm->eDLB != edlbNO)
6538             {
6539                 /* Check if this does not limit the scaling */
6540                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6541             }
6542             if (!comm->bBondComm)
6543             {
6544                 /* Without bBondComm do not go beyond the n.b. cut-off */
6545                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6546                 if (comm->cellsize_limit >= comm->cutoff)
6547                 {
6548                     /* We don't loose a lot of efficieny
6549                      * when increasing it to the n.b. cut-off.
6550                      * It can even be slightly faster, because we need
6551                      * less checks for the communication setup.
6552                      */
6553                     comm->cutoff_mbody = comm->cutoff;
6554                 }
6555             }
6556             /* Check if we did not end up below our original limit */
6557             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6558
6559             if (comm->cutoff_mbody > comm->cellsize_limit)
6560             {
6561                 comm->cellsize_limit = comm->cutoff_mbody;
6562             }
6563         }
6564         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6565     }
6566
6567     if (debug)
6568     {
6569         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6570                 "cellsize limit %f\n",
6571                 comm->bBondComm,comm->cellsize_limit);
6572     }
6573
6574     if (MASTER(cr))
6575     {
6576         check_dd_restrictions(cr,dd,ir,fplog);
6577     }
6578
6579     comm->partition_step = INT_MIN;
6580     dd->ddp_count = 0;
6581
6582     clear_dd_cycle_counts(dd);
6583
6584     return dd;
6585 }
6586
6587 static void set_dlb_limits(gmx_domdec_t *dd)
6588
6589 {
6590     int d;
6591
6592     for(d=0; d<dd->ndim; d++)
6593     {
6594         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6595         dd->comm->cellsize_min[dd->dim[d]] =
6596             dd->comm->cellsize_min_dlb[dd->dim[d]];
6597     }
6598 }
6599
6600
6601 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6602 {
6603     gmx_domdec_t *dd;
6604     gmx_domdec_comm_t *comm;
6605     real cellsize_min;
6606     int  d,nc,i;
6607     char buf[STRLEN];
6608
6609     dd = cr->dd;
6610     comm = dd->comm;
6611
6612     if (fplog)
6613     {
6614         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6615     }
6616
6617     cellsize_min = comm->cellsize_min[dd->dim[0]];
6618     for(d=1; d<dd->ndim; d++)
6619     {
6620         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6621     }
6622
6623     if (cellsize_min < comm->cellsize_limit*1.05)
6624     {
6625         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6626
6627         /* Change DLB from "auto" to "no". */
6628         comm->eDLB = edlbNO;
6629
6630         return;
6631     }
6632
6633     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6634     comm->bDynLoadBal = TRUE;
6635     dd->bGridJump = TRUE;
6636
6637     set_dlb_limits(dd);
6638
6639     /* We can set the required cell size info here,
6640      * so we do not need to communicate this.
6641      * The grid is completely uniform.
6642      */
6643     for(d=0; d<dd->ndim; d++)
6644     {
6645         if (comm->root[d])
6646         {
6647             comm->load[d].sum_m = comm->load[d].sum;
6648
6649             nc = dd->nc[dd->dim[d]];
6650             for(i=0; i<nc; i++)
6651             {
6652                 comm->root[d]->cell_f[i]    = i/(real)nc;
6653                 if (d > 0)
6654                 {
6655                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6656                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6657                 }
6658             }
6659             comm->root[d]->cell_f[nc] = 1.0;
6660         }
6661     }
6662 }
6663
6664 static char *init_bLocalCG(gmx_mtop_t *mtop)
6665 {
6666     int  ncg,cg;
6667     char *bLocalCG;
6668
6669     ncg = ncg_mtop(mtop);
6670     snew(bLocalCG,ncg);
6671     for(cg=0; cg<ncg; cg++)
6672     {
6673         bLocalCG[cg] = FALSE;
6674     }
6675
6676     return bLocalCG;
6677 }
6678
6679 void dd_init_bondeds(FILE *fplog,
6680                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6681                      gmx_vsite_t *vsite,gmx_constr_t constr,
6682                      t_inputrec *ir,bool bBCheck,cginfo_mb_t *cginfo_mb)
6683 {
6684     gmx_domdec_comm_t *comm;
6685     bool bBondComm;
6686     int  d;
6687
6688     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6689
6690     comm = dd->comm;
6691
6692     if (comm->bBondComm)
6693     {
6694         /* Communicate atoms beyond the cut-off for bonded interactions */
6695         comm = dd->comm;
6696
6697         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6698
6699         comm->bLocalCG = init_bLocalCG(mtop);
6700     }
6701     else
6702     {
6703         /* Only communicate atoms based on cut-off */
6704         comm->cglink   = NULL;
6705         comm->bLocalCG = NULL;
6706     }
6707 }
6708
6709 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6710                               t_inputrec *ir,
6711                               bool bDynLoadBal,real dlb_scale,
6712                               gmx_ddbox_t *ddbox)
6713 {
6714     gmx_domdec_comm_t *comm;
6715     int  d;
6716     ivec np;
6717     real limit,shrink;
6718     char buf[64];
6719
6720     if (fplog == NULL)
6721     {
6722         return;
6723     }
6724
6725     comm = dd->comm;
6726
6727     if (bDynLoadBal)
6728     {
6729         fprintf(fplog,"The maximum number of communication pulses is:");
6730         for(d=0; d<dd->ndim; d++)
6731         {
6732             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6733         }
6734         fprintf(fplog,"\n");
6735         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6736         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6737         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6738         for(d=0; d<DIM; d++)
6739         {
6740             if (dd->nc[d] > 1)
6741             {
6742                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6743                 {
6744                     shrink = 0;
6745                 }
6746                 else
6747                 {
6748                     shrink =
6749                         comm->cellsize_min_dlb[d]/
6750                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6751                 }
6752                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6753             }
6754         }
6755         fprintf(fplog,"\n");
6756     }
6757     else
6758     {
6759         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6760         fprintf(fplog,"The initial number of communication pulses is:");
6761         for(d=0; d<dd->ndim; d++)
6762         {
6763             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6764         }
6765         fprintf(fplog,"\n");
6766         fprintf(fplog,"The initial domain decomposition cell size is:");
6767         for(d=0; d<DIM; d++) {
6768             if (dd->nc[d] > 1)
6769             {
6770                 fprintf(fplog," %c %.2f nm",
6771                         dim2char(d),dd->comm->cellsize_min[d]);
6772             }
6773         }
6774         fprintf(fplog,"\n\n");
6775     }
6776
6777     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6778     {
6779         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6780         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6781                 "non-bonded interactions","",comm->cutoff);
6782
6783         if (bDynLoadBal)
6784         {
6785             limit = dd->comm->cellsize_limit;
6786         }
6787         else
6788         {
6789             if (dynamic_dd_box(ddbox,ir))
6790             {
6791                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6792             }
6793             limit = dd->comm->cellsize_min[XX];
6794             for(d=1; d<DIM; d++)
6795             {
6796                 limit = min(limit,dd->comm->cellsize_min[d]);
6797             }
6798         }
6799
6800         if (comm->bInterCGBondeds)
6801         {
6802             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6803                     "two-body bonded interactions","(-rdd)",
6804                     max(comm->cutoff,comm->cutoff_mbody));
6805             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6806                     "multi-body bonded interactions","(-rdd)",
6807                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6808         }
6809         if (dd->vsite_comm)
6810         {
6811             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6812                     "virtual site constructions","(-rcon)",limit);
6813         }
6814         if (dd->constraint_comm)
6815         {
6816             sprintf(buf,"atoms separated by up to %d constraints",
6817                     1+ir->nProjOrder);
6818             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6819                     buf,"(-rcon)",limit);
6820         }
6821         fprintf(fplog,"\n");
6822     }
6823
6824     fflush(fplog);
6825 }
6826
6827 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6828                        t_inputrec *ir,t_forcerec *fr,
6829                        gmx_ddbox_t *ddbox)
6830 {
6831     gmx_domdec_comm_t *comm;
6832     int  d,dim,npulse,npulse_d_max,npulse_d;
6833     bool bNoCutOff;
6834     int  natoms_tot;
6835     real vol_frac;
6836
6837     comm = dd->comm;
6838
6839     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6840
6841     if (EEL_PME(ir->coulombtype))
6842     {
6843         init_ddpme(dd,&comm->ddpme[0],0);
6844         if (comm->npmedecompdim >= 2)
6845         {
6846             init_ddpme(dd,&comm->ddpme[1],1);
6847         }
6848     }
6849     else
6850     {
6851         comm->npmenodes = 0;
6852         if (dd->pme_nodeid >= 0)
6853         {
6854             gmx_fatal_collective(FARGS,NULL,dd,
6855                                  "Can not have separate PME nodes without PME electrostatics");
6856         }
6857     }
6858
6859     /* If each molecule is a single charge group
6860      * or we use domain decomposition for each periodic dimension,
6861      * we do not need to take pbc into account for the bonded interactions.
6862      */
6863     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6864         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6865     {
6866         fr->bMolPBC = FALSE;
6867     }
6868     else
6869     {
6870         fr->bMolPBC = TRUE;
6871     }
6872
6873     if (debug)
6874     {
6875         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6876     }
6877     if (comm->eDLB != edlbNO)
6878     {
6879         /* Determine the maximum number of comm. pulses in one dimension */
6880
6881         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6882
6883         /* Determine the maximum required number of grid pulses */
6884         if (comm->cellsize_limit >= comm->cutoff)
6885         {
6886             /* Only a single pulse is required */
6887             npulse = 1;
6888         }
6889         else if (!bNoCutOff && comm->cellsize_limit > 0)
6890         {
6891             /* We round down slightly here to avoid overhead due to the latency
6892              * of extra communication calls when the cut-off
6893              * would be only slightly longer than the cell size.
6894              * Later cellsize_limit is redetermined,
6895              * so we can not miss interactions due to this rounding.
6896              */
6897             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6898         }
6899         else
6900         {
6901             /* There is no cell size limit */
6902             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6903         }
6904
6905         if (!bNoCutOff && npulse > 1)
6906         {
6907             /* See if we can do with less pulses, based on dlb_scale */
6908             npulse_d_max = 0;
6909             for(d=0; d<dd->ndim; d++)
6910             {
6911                 dim = dd->dim[d];
6912                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6913                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6914                 npulse_d_max = max(npulse_d_max,npulse_d);
6915             }
6916             npulse = min(npulse,npulse_d_max);
6917         }
6918
6919         /* This env var can override npulse */
6920         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6921         if (d > 0)
6922         {
6923             npulse = d;
6924         }
6925
6926         comm->maxpulse = 1;
6927         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6928         for(d=0; d<dd->ndim; d++)
6929         {
6930             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6931             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6932             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6933             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6934             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6935             {
6936                 comm->bVacDLBNoLimit = FALSE;
6937             }
6938         }
6939
6940         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6941         if (!comm->bVacDLBNoLimit)
6942         {
6943             comm->cellsize_limit = max(comm->cellsize_limit,
6944                                        comm->cutoff/comm->maxpulse);
6945         }
6946         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6947         /* Set the minimum cell size for each DD dimension */
6948         for(d=0; d<dd->ndim; d++)
6949         {
6950             if (comm->bVacDLBNoLimit ||
6951                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6952             {
6953                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6954             }
6955             else
6956             {
6957                 comm->cellsize_min_dlb[dd->dim[d]] =
6958                     comm->cutoff/comm->cd[d].np_dlb;
6959             }
6960         }
6961         if (comm->cutoff_mbody <= 0)
6962         {
6963             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6964         }
6965         if (comm->bDynLoadBal)
6966         {
6967             set_dlb_limits(dd);
6968         }
6969     }
6970
6971     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6972     if (comm->eDLB == edlbAUTO)
6973     {
6974         if (fplog)
6975         {
6976             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6977         }
6978         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6979     }
6980
6981     if (ir->ePBC == epbcNONE)
6982     {
6983         vol_frac = 1 - 1/(double)dd->nnodes;
6984     }
6985     else
6986     {
6987         vol_frac =
6988             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
6989     }
6990     if (debug)
6991     {
6992         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
6993     }
6994     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
6995
6996     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
6997 }
6998
6999 static void merge_cg_buffers(int ncell,
7000                              gmx_domdec_comm_dim_t *cd, int pulse,
7001                              int  *ncg_cell,
7002                              int  *index_gl, int  *recv_i,
7003                              rvec *cg_cm,    rvec *recv_vr,
7004                              int *cgindex,
7005                              cginfo_mb_t *cginfo_mb,int *cginfo)
7006 {
7007     gmx_domdec_ind_t *ind,*ind_p;
7008     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7009     int shift,shift_at;
7010
7011     ind = &cd->ind[pulse];
7012
7013     /* First correct the already stored data */
7014     shift = ind->nrecv[ncell];
7015     for(cell=ncell-1; cell>=0; cell--)
7016     {
7017         shift -= ind->nrecv[cell];
7018         if (shift > 0)
7019         {
7020             /* Move the cg's present from previous grid pulses */
7021             cg0 = ncg_cell[ncell+cell];
7022             cg1 = ncg_cell[ncell+cell+1];
7023             cgindex[cg1+shift] = cgindex[cg1];
7024             for(cg=cg1-1; cg>=cg0; cg--)
7025             {
7026                 index_gl[cg+shift] = index_gl[cg];
7027                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7028                 cgindex[cg+shift] = cgindex[cg];
7029                 cginfo[cg+shift] = cginfo[cg];
7030             }
7031             /* Correct the already stored send indices for the shift */
7032             for(p=1; p<=pulse; p++)
7033             {
7034                 ind_p = &cd->ind[p];
7035                 cg0 = 0;
7036                 for(c=0; c<cell; c++)
7037                 {
7038                     cg0 += ind_p->nsend[c];
7039                 }
7040                 cg1 = cg0 + ind_p->nsend[cell];
7041                 for(cg=cg0; cg<cg1; cg++)
7042                 {
7043                     ind_p->index[cg] += shift;
7044                 }
7045             }
7046         }
7047     }
7048
7049     /* Merge in the communicated buffers */
7050     shift = 0;
7051     shift_at = 0;
7052     cg0 = 0;
7053     for(cell=0; cell<ncell; cell++)
7054     {
7055         cg1 = ncg_cell[ncell+cell+1] + shift;
7056         if (shift_at > 0)
7057         {
7058             /* Correct the old cg indices */
7059             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7060             {
7061                 cgindex[cg+1] += shift_at;
7062             }
7063         }
7064         for(cg=0; cg<ind->nrecv[cell]; cg++)
7065         {
7066             /* Copy this charge group from the buffer */
7067             index_gl[cg1] = recv_i[cg0];
7068             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7069             /* Add it to the cgindex */
7070             cg_gl = index_gl[cg1];
7071             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7072             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7073             cgindex[cg1+1] = cgindex[cg1] + nat;
7074             cg0++;
7075             cg1++;
7076             shift_at += nat;
7077         }
7078         shift += ind->nrecv[cell];
7079         ncg_cell[ncell+cell+1] = cg1;
7080     }
7081 }
7082
7083 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7084                                int nzone,int cg0,const int *cgindex)
7085 {
7086     int cg,zone,p;
7087
7088     /* Store the atom block boundaries for easy copying of communication buffers
7089      */
7090     cg = cg0;
7091     for(zone=0; zone<nzone; zone++)
7092     {
7093         for(p=0; p<cd->np; p++) {
7094             cd->ind[p].cell2at0[zone] = cgindex[cg];
7095             cg += cd->ind[p].nrecv[zone];
7096             cd->ind[p].cell2at1[zone] = cgindex[cg];
7097         }
7098     }
7099 }
7100
7101 static bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7102 {
7103     int  i;
7104     bool bMiss;
7105
7106     bMiss = FALSE;
7107     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7108     {
7109         if (!bLocalCG[link->a[i]])
7110         {
7111             bMiss = TRUE;
7112         }
7113     }
7114
7115     return bMiss;
7116 }
7117
7118 static void setup_dd_communication(gmx_domdec_t *dd,
7119                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7120 {
7121     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7122     int nzone,nzone_send,zone,zonei,cg0,cg1;
7123     int c,i,j,cg,cg_gl,nrcg;
7124     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7125     gmx_domdec_comm_t *comm;
7126     gmx_domdec_zones_t *zones;
7127     gmx_domdec_comm_dim_t *cd;
7128     gmx_domdec_ind_t *ind;
7129     cginfo_mb_t *cginfo_mb;
7130     bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7131     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7132     rvec rb,rn;
7133     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7134     real bcorner[DIM],bcorner_round_1=0;
7135     ivec tric_dist;
7136     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7137     real skew_fac2_d,skew_fac_01;
7138     rvec sf2_round;
7139     int  nsend,nat;
7140
7141     if (debug)
7142     {
7143         fprintf(debug,"Setting up DD communication\n");
7144     }
7145
7146     comm  = dd->comm;
7147     cg_cm = fr->cg_cm;
7148
7149     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7150     {
7151         dim = dd->dim[dim_ind];
7152
7153         /* Check if we need to use triclinic distances */
7154         tric_dist[dim_ind] = 0;
7155         for(i=0; i<=dim_ind; i++)
7156         {
7157             if (ddbox->tric_dir[dd->dim[i]])
7158             {
7159                 tric_dist[dim_ind] = 1;
7160             }
7161         }
7162     }
7163
7164     bBondComm = comm->bBondComm;
7165
7166     /* Do we need to determine extra distances for multi-body bondeds? */
7167     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7168
7169     /* Do we need to determine extra distances for only two-body bondeds? */
7170     bDist2B = (bBondComm && !bDistMB);
7171
7172     r_comm2  = sqr(comm->cutoff);
7173     r_bcomm2 = sqr(comm->cutoff_mbody);
7174
7175     if (debug)
7176     {
7177         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7178     }
7179
7180     zones = &comm->zones;
7181
7182     dim0 = dd->dim[0];
7183     /* The first dimension is equal for all cells */
7184     corner[0][0] = comm->cell_x0[dim0];
7185     if (bDistMB)
7186     {
7187         bcorner[0] = corner[0][0];
7188     }
7189     if (dd->ndim >= 2)
7190     {
7191         dim1 = dd->dim[1];
7192         /* This cell row is only seen from the first row */
7193         corner[1][0] = comm->cell_x0[dim1];
7194         /* All rows can see this row */
7195         corner[1][1] = comm->cell_x0[dim1];
7196         if (dd->bGridJump)
7197         {
7198             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7199             if (bDistMB)
7200             {
7201                 /* For the multi-body distance we need the maximum */
7202                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7203             }
7204         }
7205         /* Set the upper-right corner for rounding */
7206         corner_round_0 = comm->cell_x1[dim0];
7207
7208         if (dd->ndim >= 3)
7209         {
7210             dim2 = dd->dim[2];
7211             for(j=0; j<4; j++)
7212             {
7213                 corner[2][j] = comm->cell_x0[dim2];
7214             }
7215             if (dd->bGridJump)
7216             {
7217                 /* Use the maximum of the i-cells that see a j-cell */
7218                 for(i=0; i<zones->nizone; i++)
7219                 {
7220                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7221                     {
7222                         if (j >= 4)
7223                         {
7224                             corner[2][j-4] =
7225                                 max(corner[2][j-4],
7226                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7227                         }
7228                     }
7229                 }
7230                 if (bDistMB)
7231                 {
7232                     /* For the multi-body distance we need the maximum */
7233                     bcorner[2] = comm->cell_x0[dim2];
7234                     for(i=0; i<2; i++)
7235                     {
7236                         for(j=0; j<2; j++)
7237                         {
7238                             bcorner[2] = max(bcorner[2],
7239                                              comm->zone_d2[i][j].p1_0);
7240                         }
7241                     }
7242                 }
7243             }
7244
7245             /* Set the upper-right corner for rounding */
7246             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7247              * Only cell (0,0,0) can see cell 7 (1,1,1)
7248              */
7249             corner_round_1[0] = comm->cell_x1[dim1];
7250             corner_round_1[3] = comm->cell_x1[dim1];
7251             if (dd->bGridJump)
7252             {
7253                 corner_round_1[0] = max(comm->cell_x1[dim1],
7254                                         comm->zone_d1[1].mch1);
7255                 if (bDistMB)
7256                 {
7257                     /* For the multi-body distance we need the maximum */
7258                     bcorner_round_1 = max(comm->cell_x1[dim1],
7259                                           comm->zone_d1[1].p1_1);
7260                 }
7261             }
7262         }
7263     }
7264
7265     /* Triclinic stuff */
7266     normal = ddbox->normal;
7267     skew_fac_01 = 0;
7268     if (dd->ndim >= 2)
7269     {
7270         v_0 = ddbox->v[dim0];
7271         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7272         {
7273             /* Determine the coupling coefficient for the distances
7274              * to the cell planes along dim0 and dim1 through dim2.
7275              * This is required for correct rounding.
7276              */
7277             skew_fac_01 =
7278                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7279             if (debug)
7280             {
7281                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7282             }
7283         }
7284     }
7285     if (dd->ndim >= 3)
7286     {
7287         v_1 = ddbox->v[dim1];
7288     }
7289
7290     zone_cg_range = zones->cg_range;
7291     index_gl = dd->index_gl;
7292     cgindex  = dd->cgindex;
7293     cginfo_mb = fr->cginfo_mb;
7294
7295     zone_cg_range[0]   = 0;
7296     zone_cg_range[1]   = dd->ncg_home;
7297     comm->zone_ncg1[0] = dd->ncg_home;
7298     pos_cg             = dd->ncg_home;
7299
7300     nat_tot = dd->nat_home;
7301     nzone = 1;
7302     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7303     {
7304         dim = dd->dim[dim_ind];
7305         cd = &comm->cd[dim_ind];
7306
7307         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7308         {
7309             /* No pbc in this dimension, the first node should not comm. */
7310             nzone_send = 0;
7311         }
7312         else
7313         {
7314             nzone_send = nzone;
7315         }
7316
7317         bScrew = (dd->bScrewPBC && dim == XX);
7318
7319         v_d = ddbox->v[dim];
7320         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7321
7322         cd->bInPlace = TRUE;
7323         for(p=0; p<cd->np; p++)
7324         {
7325             /* Only atoms communicated in the first pulse are used
7326              * for multi-body bonded interactions or for bBondComm.
7327              */
7328             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7329             bDistMB_pulse = (bDistMB && bDistBonded);
7330
7331             ind = &cd->ind[p];
7332             nsend = 0;
7333             nat = 0;
7334             for(zone=0; zone<nzone_send; zone++)
7335             {
7336                 if (tric_dist[dim_ind] && dim_ind > 0)
7337                 {
7338                     /* Determine slightly more optimized skew_fac's
7339                      * for rounding.
7340                      * This reduces the number of communicated atoms
7341                      * by about 10% for 3D DD of rhombic dodecahedra.
7342                      */
7343                     for(dimd=0; dimd<dim; dimd++)
7344                     {
7345                         sf2_round[dimd] = 1;
7346                         if (ddbox->tric_dir[dimd])
7347                         {
7348                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7349                             {
7350                                 /* If we are shifted in dimension i
7351                                  * and the cell plane is tilted forward
7352                                  * in dimension i, skip this coupling.
7353                                  */
7354                                 if (!(zones->shift[nzone+zone][i] &&
7355                                       ddbox->v[dimd][i][dimd] >= 0))
7356                                 {
7357                                     sf2_round[dimd] +=
7358                                         sqr(ddbox->v[dimd][i][dimd]);
7359                                 }
7360                             }
7361                             sf2_round[dimd] = 1/sf2_round[dimd];
7362                         }
7363                     }
7364                 }
7365
7366                 zonei = zone_perm[dim_ind][zone];
7367                 if (p == 0)
7368                 {
7369                     /* Here we permutate the zones to obtain a convenient order
7370                      * for neighbor searching
7371                      */
7372                     cg0 = zone_cg_range[zonei];
7373                     cg1 = zone_cg_range[zonei+1];
7374                 }
7375                 else
7376                 {
7377                     /* Look only at the cg's received in the previous grid pulse
7378                      */
7379                     cg1 = zone_cg_range[nzone+zone+1];
7380                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7381                 }
7382                 ind->nsend[zone] = 0;
7383                 for(cg=cg0; cg<cg1; cg++)
7384                 {
7385                     r2  = 0;
7386                     rb2 = 0;
7387                     if (tric_dist[dim_ind] == 0)
7388                     {
7389                         /* Rectangular direction, easy */
7390                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7391                         if (r > 0)
7392                         {
7393                             r2 += r*r;
7394                         }
7395                         if (bDistMB_pulse)
7396                         {
7397                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7398                             if (r > 0)
7399                             {
7400                                 rb2 += r*r;
7401                             }
7402                         }
7403                         /* Rounding gives at most a 16% reduction
7404                          * in communicated atoms
7405                          */
7406                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7407                         {
7408                             r = cg_cm[cg][dim0] - corner_round_0;
7409                             /* This is the first dimension, so always r >= 0 */
7410                             r2 += r*r;
7411                             if (bDistMB_pulse)
7412                             {
7413                                 rb2 += r*r;
7414                             }
7415                         }
7416                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7417                         {
7418                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7419                             if (r > 0)
7420                             {
7421                                 r2 += r*r;
7422                             }
7423                             if (bDistMB_pulse)
7424                             {
7425                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7426                                 if (r > 0)
7427                                 {
7428                                     rb2 += r*r;
7429                                 }
7430                             }
7431                         }
7432                     }
7433                     else
7434                     {
7435                         /* Triclinic direction, more complicated */
7436                         clear_rvec(rn);
7437                         clear_rvec(rb);
7438                         /* Rounding, conservative as the skew_fac multiplication
7439                          * will slightly underestimate the distance.
7440                          */
7441                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7442                         {
7443                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7444                             for(i=dim0+1; i<DIM; i++)
7445                             {
7446                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7447                             }
7448                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7449                             if (bDistMB_pulse)
7450                             {
7451                                 rb[dim0] = rn[dim0];
7452                                 rb2 = r2;
7453                             }
7454                             /* Take care that the cell planes along dim0 might not
7455                              * be orthogonal to those along dim1 and dim2.
7456                              */
7457                             for(i=1; i<=dim_ind; i++)
7458                             {
7459                                 dimd = dd->dim[i];
7460                                 if (normal[dim0][dimd] > 0)
7461                                 {
7462                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7463                                     if (bDistMB_pulse)
7464                                     {
7465                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7466                                     }
7467                                 }
7468                             }
7469                         }
7470                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7471                         {
7472                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7473                             tric_sh = 0;
7474                             for(i=dim1+1; i<DIM; i++)
7475                             {
7476                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7477                             }
7478                             rn[dim1] += tric_sh;
7479                             if (rn[dim1] > 0)
7480                             {
7481                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7482                                 /* Take care of coupling of the distances
7483                                  * to the planes along dim0 and dim1 through dim2.
7484                                  */
7485                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7486                                 /* Take care that the cell planes along dim1
7487                                  * might not be orthogonal to that along dim2.
7488                                  */
7489                                 if (normal[dim1][dim2] > 0)
7490                                 {
7491                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7492                                 }
7493                             }
7494                             if (bDistMB_pulse)
7495                             {
7496                                 rb[dim1] +=
7497                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7498                                 if (rb[dim1] > 0)
7499                                 {
7500                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7501                                     /* Take care of coupling of the distances
7502                                      * to the planes along dim0 and dim1 through dim2.
7503                                      */
7504                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7505                                     /* Take care that the cell planes along dim1
7506                                      * might not be orthogonal to that along dim2.
7507                                      */
7508                                     if (normal[dim1][dim2] > 0)
7509                                     {
7510                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7511                                     }
7512                                 }
7513                             }
7514                         }
7515                         /* The distance along the communication direction */
7516                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7517                         tric_sh = 0;
7518                         for(i=dim+1; i<DIM; i++)
7519                         {
7520                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7521                         }
7522                         rn[dim] += tric_sh;
7523                         if (rn[dim] > 0)
7524                         {
7525                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7526                             /* Take care of coupling of the distances
7527                              * to the planes along dim0 and dim1 through dim2.
7528                              */
7529                             if (dim_ind == 1 && zonei == 1)
7530                             {
7531                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7532                             }
7533                         }
7534                         if (bDistMB_pulse)
7535                         {
7536                             clear_rvec(rb);
7537                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7538                             if (rb[dim] > 0)
7539                             {
7540                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7541                                 /* Take care of coupling of the distances
7542                                  * to the planes along dim0 and dim1 through dim2.
7543                                  */
7544                                 if (dim_ind == 1 && zonei == 1)
7545                                 {
7546                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7547                                 }
7548                             }
7549                         }
7550                     }
7551
7552                     if (r2 < r_comm2 ||
7553                         (bDistBonded &&
7554                          ((bDistMB && rb2 < r_bcomm2) ||
7555                           (bDist2B && r2  < r_bcomm2)) &&
7556                          (!bBondComm ||
7557                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7558                            missing_link(comm->cglink,index_gl[cg],
7559                                         comm->bLocalCG)))))
7560                     {
7561                         /* Make an index to the local charge groups */
7562                         if (nsend+1 > ind->nalloc)
7563                         {
7564                             ind->nalloc = over_alloc_large(nsend+1);
7565                             srenew(ind->index,ind->nalloc);
7566                         }
7567                         if (nsend+1 > comm->nalloc_int)
7568                         {
7569                             comm->nalloc_int = over_alloc_large(nsend+1);
7570                             srenew(comm->buf_int,comm->nalloc_int);
7571                         }
7572                         ind->index[nsend] = cg;
7573                         comm->buf_int[nsend] = index_gl[cg];
7574                         ind->nsend[zone]++;
7575                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7576
7577                         if (dd->ci[dim] == 0)
7578                         {
7579                             /* Correct cg_cm for pbc */
7580                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7581                             if (bScrew)
7582                             {
7583                                 comm->vbuf.v[nsend][YY] =
7584                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7585                                 comm->vbuf.v[nsend][ZZ] =
7586                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7587                             }
7588                         }
7589                         else
7590                         {
7591                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7592                         }
7593                         nsend++;
7594                         nat += cgindex[cg+1] - cgindex[cg];
7595                     }
7596                 }
7597             }
7598             /* Clear the counts in case we do not have pbc */
7599             for(zone=nzone_send; zone<nzone; zone++)
7600             {
7601                 ind->nsend[zone] = 0;
7602             }
7603             ind->nsend[nzone]   = nsend;
7604             ind->nsend[nzone+1] = nat;
7605             /* Communicate the number of cg's and atoms to receive */
7606             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7607                             ind->nsend, nzone+2,
7608                             ind->nrecv, nzone+2);
7609
7610             /* The rvec buffer is also required for atom buffers of size nsend
7611              * in dd_move_x and dd_move_f.
7612              */
7613             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7614
7615             if (p > 0)
7616             {
7617                 /* We can receive in place if only the last zone is not empty */
7618                 for(zone=0; zone<nzone-1; zone++)
7619                 {
7620                     if (ind->nrecv[zone] > 0)
7621                     {
7622                         cd->bInPlace = FALSE;
7623                     }
7624                 }
7625                 if (!cd->bInPlace)
7626                 {
7627                     /* The int buffer is only required here for the cg indices */
7628                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7629                     {
7630                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7631                         srenew(comm->buf_int2,comm->nalloc_int2);
7632                     }
7633                     /* The rvec buffer is also required for atom buffers
7634                      * of size nrecv in dd_move_x and dd_move_f.
7635                      */
7636                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7637                     vec_rvec_check_alloc(&comm->vbuf2,i);
7638                 }
7639             }
7640
7641             /* Make space for the global cg indices */
7642             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7643                 || dd->cg_nalloc == 0)
7644             {
7645                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7646                 srenew(index_gl,dd->cg_nalloc);
7647                 srenew(cgindex,dd->cg_nalloc+1);
7648             }
7649             /* Communicate the global cg indices */
7650             if (cd->bInPlace)
7651             {
7652                 recv_i = index_gl + pos_cg;
7653             }
7654             else
7655             {
7656                 recv_i = comm->buf_int2;
7657             }
7658             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7659                             comm->buf_int, nsend,
7660                             recv_i,        ind->nrecv[nzone]);
7661
7662             /* Make space for cg_cm */
7663             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7664             {
7665                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7666                 cg_cm = fr->cg_cm;
7667             }
7668             /* Communicate cg_cm */
7669             if (cd->bInPlace)
7670             {
7671                 recv_vr = cg_cm + pos_cg;
7672             }
7673             else
7674             {
7675                 recv_vr = comm->vbuf2.v;
7676             }
7677             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7678                              comm->vbuf.v, nsend,
7679                              recv_vr,      ind->nrecv[nzone]);
7680
7681             /* Make the charge group index */
7682             if (cd->bInPlace)
7683             {
7684                 zone = (p == 0 ? 0 : nzone - 1);
7685                 while (zone < nzone)
7686                 {
7687                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7688                     {
7689                         cg_gl = index_gl[pos_cg];
7690                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7691                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7692                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7693                         if (bBondComm)
7694                         {
7695                             /* Update the charge group presence,
7696                              * so we can use it in the next pass of the loop.
7697                              */
7698                             comm->bLocalCG[cg_gl] = TRUE;
7699                         }
7700                         pos_cg++;
7701                     }
7702                     if (p == 0)
7703                     {
7704                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7705                     }
7706                     zone++;
7707                     zone_cg_range[nzone+zone] = pos_cg;
7708                 }
7709             }
7710             else
7711             {
7712                 /* This part of the code is never executed with bBondComm. */
7713                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7714                                  index_gl,recv_i,cg_cm,recv_vr,
7715                                  cgindex,fr->cginfo_mb,fr->cginfo);
7716                 pos_cg += ind->nrecv[nzone];
7717             }
7718             nat_tot += ind->nrecv[nzone+1];
7719         }
7720         if (!cd->bInPlace)
7721         {
7722             /* Store the atom block for easy copying of communication buffers */
7723             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7724         }
7725         nzone += nzone;
7726     }
7727     dd->index_gl = index_gl;
7728     dd->cgindex  = cgindex;
7729
7730     dd->ncg_tot = zone_cg_range[zones->n];
7731     dd->nat_tot = nat_tot;
7732     comm->nat[ddnatHOME] = dd->nat_home;
7733     for(i=ddnatZONE; i<ddnatNR; i++)
7734     {
7735         comm->nat[i] = dd->nat_tot;
7736     }
7737
7738     if (!bBondComm)
7739     {
7740         /* We don't need to update cginfo, since that was alrady done above.
7741          * So we pass NULL for the forcerec.
7742          */
7743         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7744                       NULL,comm->bLocalCG);
7745     }
7746
7747     if (debug)
7748     {
7749         fprintf(debug,"Finished setting up DD communication, zones:");
7750         for(c=0; c<zones->n; c++)
7751         {
7752             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7753         }
7754         fprintf(debug,"\n");
7755     }
7756 }
7757
7758 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7759 {
7760     int c;
7761
7762     for(c=0; c<zones->nizone; c++)
7763     {
7764         zones->izone[c].cg1  = zones->cg_range[c+1];
7765         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7766         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7767     }
7768 }
7769
7770 static int comp_cgsort(const void *a,const void *b)
7771 {
7772     int comp;
7773
7774     gmx_cgsort_t *cga,*cgb;
7775     cga = (gmx_cgsort_t *)a;
7776     cgb = (gmx_cgsort_t *)b;
7777
7778     comp = cga->nsc - cgb->nsc;
7779     if (comp == 0)
7780     {
7781         comp = cga->ind_gl - cgb->ind_gl;
7782     }
7783
7784     return comp;
7785 }
7786
7787 static void order_int_cg(int n,gmx_cgsort_t *sort,
7788                          int *a,int *buf)
7789 {
7790     int i;
7791
7792     /* Order the data */
7793     for(i=0; i<n; i++)
7794     {
7795         buf[i] = a[sort[i].ind];
7796     }
7797
7798     /* Copy back to the original array */
7799     for(i=0; i<n; i++)
7800     {
7801         a[i] = buf[i];
7802     }
7803 }
7804
7805 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7806                          rvec *v,rvec *buf)
7807 {
7808     int i;
7809
7810     /* Order the data */
7811     for(i=0; i<n; i++)
7812     {
7813         copy_rvec(v[sort[i].ind],buf[i]);
7814     }
7815
7816     /* Copy back to the original array */
7817     for(i=0; i<n; i++)
7818     {
7819         copy_rvec(buf[i],v[i]);
7820     }
7821 }
7822
7823 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7824                            rvec *v,rvec *buf)
7825 {
7826     int a,atot,cg,cg0,cg1,i;
7827
7828     /* Order the data */
7829     a = 0;
7830     for(cg=0; cg<ncg; cg++)
7831     {
7832         cg0 = cgindex[sort[cg].ind];
7833         cg1 = cgindex[sort[cg].ind+1];
7834         for(i=cg0; i<cg1; i++)
7835         {
7836             copy_rvec(v[i],buf[a]);
7837             a++;
7838         }
7839     }
7840     atot = a;
7841
7842     /* Copy back to the original array */
7843     for(a=0; a<atot; a++)
7844     {
7845         copy_rvec(buf[a],v[a]);
7846     }
7847 }
7848
7849 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7850                          int nsort_new,gmx_cgsort_t *sort_new,
7851                          gmx_cgsort_t *sort1)
7852 {
7853     int i1,i2,i_new;
7854
7855     /* The new indices are not very ordered, so we qsort them */
7856     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7857
7858     /* sort2 is already ordered, so now we can merge the two arrays */
7859     i1 = 0;
7860     i2 = 0;
7861     i_new = 0;
7862     while(i2 < nsort2 || i_new < nsort_new)
7863     {
7864         if (i2 == nsort2)
7865         {
7866             sort1[i1++] = sort_new[i_new++];
7867         }
7868         else if (i_new == nsort_new)
7869         {
7870             sort1[i1++] = sort2[i2++];
7871         }
7872         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7873                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7874                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7875         {
7876             sort1[i1++] = sort2[i2++];
7877         }
7878         else
7879         {
7880             sort1[i1++] = sort_new[i_new++];
7881         }
7882     }
7883 }
7884
7885 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7886                           rvec *cgcm,t_forcerec *fr,t_state *state,
7887                           int ncg_home_old)
7888 {
7889     gmx_domdec_sort_t *sort;
7890     gmx_cgsort_t *cgsort,*sort_i;
7891     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7892     rvec *vbuf;
7893
7894     sort = dd->comm->sort;
7895
7896     if (dd->ncg_home > sort->sort_nalloc)
7897     {
7898         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7899         srenew(sort->sort1,sort->sort_nalloc);
7900         srenew(sort->sort2,sort->sort_nalloc);
7901     }
7902
7903     if (ncg_home_old >= 0)
7904     {
7905         /* The charge groups that remained in the same ns grid cell
7906          * are completely ordered. So we can sort efficiently by sorting
7907          * the charge groups that did move into the stationary list.
7908          */
7909         ncg_new = 0;
7910         nsort2 = 0;
7911         nsort_new = 0;
7912         for(i=0; i<dd->ncg_home; i++)
7913         {
7914             /* Check if this cg did not move to another node */
7915             cell_index = fr->ns.grid->cell_index[i];
7916             if (cell_index !=  4*fr->ns.grid->ncells)
7917             {
7918                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7919                 {
7920                     /* This cg is new on this node or moved ns grid cell */
7921                     if (nsort_new >= sort->sort_new_nalloc)
7922                     {
7923                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7924                         srenew(sort->sort_new,sort->sort_new_nalloc);
7925                     }
7926                     sort_i = &(sort->sort_new[nsort_new++]);
7927                 }
7928                 else
7929                 {
7930                     /* This cg did not move */
7931                     sort_i = &(sort->sort2[nsort2++]);
7932                 }
7933                 /* Sort on the ns grid cell indices
7934                  * and the global topology index
7935                  */
7936                 sort_i->nsc    = cell_index;
7937                 sort_i->ind_gl = dd->index_gl[i];
7938                 sort_i->ind    = i;
7939                 ncg_new++;
7940             }
7941         }
7942         if (debug)
7943         {
7944             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7945                     nsort2,nsort_new);
7946         }
7947         /* Sort efficiently */
7948         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7949     }
7950     else
7951     {
7952         cgsort = sort->sort1;
7953         ncg_new = 0;
7954         for(i=0; i<dd->ncg_home; i++)
7955         {
7956             /* Sort on the ns grid cell indices
7957              * and the global topology index
7958              */
7959             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7960             cgsort[i].ind_gl = dd->index_gl[i];
7961             cgsort[i].ind    = i;
7962             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7963             {
7964                 ncg_new++;
7965             }
7966         }
7967         if (debug)
7968         {
7969             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7970         }
7971         /* Determine the order of the charge groups using qsort */
7972         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7973     }
7974     cgsort = sort->sort1;
7975
7976     /* We alloc with the old size, since cgindex is still old */
7977     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7978     vbuf = dd->comm->vbuf.v;
7979
7980     /* Remove the charge groups which are no longer at home here */
7981     dd->ncg_home = ncg_new;
7982
7983     /* Reorder the state */
7984     for(i=0; i<estNR; i++)
7985     {
7986         if (EST_DISTR(i) && state->flags & (1<<i))
7987         {
7988             switch (i)
7989             {
7990             case estX:
7991                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
7992                 break;
7993             case estV:
7994                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
7995                 break;
7996             case estSDX:
7997                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
7998                 break;
7999             case estCGP:
8000                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8001                 break;
8002             case estLD_RNG:
8003             case estLD_RNGI:
8004             case estDISRE_INITF:
8005             case estDISRE_RM3TAV:
8006             case estORIRE_INITF:
8007             case estORIRE_DTAV:
8008                 /* No ordering required */
8009                 break;
8010             default:
8011                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8012                 break;
8013             }
8014         }
8015     }
8016     /* Reorder cgcm */
8017     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8018
8019     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8020     {
8021         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8022         srenew(sort->ibuf,sort->ibuf_nalloc);
8023     }
8024     ibuf = sort->ibuf;
8025     /* Reorder the global cg index */
8026     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8027     /* Reorder the cginfo */
8028     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8029     /* Rebuild the local cg index */
8030     ibuf[0] = 0;
8031     for(i=0; i<dd->ncg_home; i++)
8032     {
8033         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8034         ibuf[i+1] = ibuf[i] + cgsize;
8035     }
8036     for(i=0; i<dd->ncg_home+1; i++)
8037     {
8038         dd->cgindex[i] = ibuf[i];
8039     }
8040     /* Set the home atom number */
8041     dd->nat_home = dd->cgindex[dd->ncg_home];
8042
8043     /* Copy the sorted ns cell indices back to the ns grid struct */
8044     for(i=0; i<dd->ncg_home; i++)
8045     {
8046         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8047     }
8048     fr->ns.grid->nr = dd->ncg_home;
8049 }
8050
8051 static void add_dd_statistics(gmx_domdec_t *dd)
8052 {
8053     gmx_domdec_comm_t *comm;
8054     int ddnat;
8055
8056     comm = dd->comm;
8057
8058     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8059     {
8060         comm->sum_nat[ddnat-ddnatZONE] +=
8061             comm->nat[ddnat] - comm->nat[ddnat-1];
8062     }
8063     comm->ndecomp++;
8064 }
8065
8066 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8067 {
8068     gmx_domdec_comm_t *comm;
8069     int ddnat;
8070
8071     comm = dd->comm;
8072
8073     /* Reset all the statistics and counters for total run counting */
8074     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8075     {
8076         comm->sum_nat[ddnat-ddnatZONE] = 0;
8077     }
8078     comm->ndecomp = 0;
8079     comm->nload = 0;
8080     comm->load_step = 0;
8081     comm->load_sum = 0;
8082     comm->load_max = 0;
8083     clear_ivec(comm->load_lim);
8084     comm->load_mdf = 0;
8085     comm->load_pme = 0;
8086 }
8087
8088 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8089 {
8090     gmx_domdec_comm_t *comm;
8091     int ddnat;
8092     double av;
8093
8094     comm = cr->dd->comm;
8095
8096     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8097
8098     if (fplog == NULL)
8099     {
8100         return;
8101     }
8102
8103     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8104
8105     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8106     {
8107         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8108         switch(ddnat)
8109         {
8110         case ddnatZONE:
8111             fprintf(fplog,
8112                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8113                     2,av);
8114             break;
8115         case ddnatVSITE:
8116             if (cr->dd->vsite_comm)
8117             {
8118                 fprintf(fplog,
8119                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8120                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8121                         av);
8122             }
8123             break;
8124         case ddnatCON:
8125             if (cr->dd->constraint_comm)
8126             {
8127                 fprintf(fplog,
8128                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8129                         1 + ir->nLincsIter,av);
8130             }
8131             break;
8132         default:
8133             gmx_incons(" Unknown type for DD statistics");
8134         }
8135     }
8136     fprintf(fplog,"\n");
8137
8138     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8139     {
8140         print_dd_load_av(fplog,cr->dd);
8141     }
8142 }
8143
8144 void dd_partition_system(FILE            *fplog,
8145                          gmx_large_int_t      step,
8146                          t_commrec       *cr,
8147                          bool            bMasterState,
8148                          int             nstglobalcomm,
8149                          t_state         *state_global,
8150                          gmx_mtop_t      *top_global,
8151                          t_inputrec      *ir,
8152                          t_state         *state_local,
8153                          rvec            **f,
8154                          t_mdatoms       *mdatoms,
8155                          gmx_localtop_t  *top_local,
8156                          t_forcerec      *fr,
8157                          gmx_vsite_t     *vsite,
8158                          gmx_shellfc_t   shellfc,
8159                          gmx_constr_t    constr,
8160                          t_nrnb          *nrnb,
8161                          gmx_wallcycle_t wcycle,
8162                          bool            bVerbose)
8163 {
8164     gmx_domdec_t *dd;
8165     gmx_domdec_comm_t *comm;
8166     gmx_ddbox_t ddbox={0};
8167     t_block *cgs_gl;
8168     gmx_large_int_t step_pcoupl;
8169     rvec cell_ns_x0,cell_ns_x1;
8170     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8171     bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8172     bool bRedist,bSortCG,bResortAll;
8173     ivec ncells_old,np;
8174     real grid_density;
8175     char sbuf[22];
8176
8177     dd = cr->dd;
8178     comm = dd->comm;
8179
8180     bBoxChanged = (bMasterState || DEFORM(*ir));
8181     if (ir->epc != epcNO)
8182     {
8183         /* With nstcalcenery > 1 pressure coupling happens.
8184          * one step after calculating the energies.
8185          * Box scaling happens at the end of the MD step,
8186          * after the DD partitioning.
8187          * We therefore have to do DLB in the first partitioning
8188          * after an MD step where P-coupling occured.
8189          * We need to determine the last step in which p-coupling occurred.
8190          * MRS -- need to validate this for vv?
8191          */
8192         n = ir->nstcalcenergy;
8193         if (n == 1)
8194         {
8195             step_pcoupl = step - 1;
8196         }
8197         else
8198         {
8199             step_pcoupl = ((step - 1)/n)*n + 1;
8200         }
8201         if (step_pcoupl >= comm->partition_step)
8202         {
8203             bBoxChanged = TRUE;
8204         }
8205     }
8206
8207     bNStGlobalComm = (step >= comm->partition_step + nstglobalcomm);
8208
8209     if (!comm->bDynLoadBal)
8210     {
8211         bDoDLB = FALSE;
8212     }
8213     else
8214     {
8215         /* Should we do dynamic load balacing this step?
8216          * Since it requires (possibly expensive) global communication,
8217          * we might want to do DLB less frequently.
8218          */
8219         if (bBoxChanged || ir->epc != epcNO)
8220         {
8221             bDoDLB = bBoxChanged;
8222         }
8223         else
8224         {
8225             bDoDLB = bNStGlobalComm;
8226         }
8227     }
8228
8229     /* Check if we have recorded loads on the nodes */
8230     if (comm->bRecordLoad && dd_load_count(comm))
8231     {
8232         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8233         {
8234             /* Check if we should use DLB at the second partitioning
8235              * and every 100 partitionings,
8236              * so the extra communication cost is negligible.
8237              */
8238             n = max(100,nstglobalcomm);
8239             bCheckDLB = (comm->n_load_collect == 0 ||
8240                          comm->n_load_have % n == n-1);
8241         }
8242         else
8243         {
8244             bCheckDLB = FALSE;
8245         }
8246
8247         /* Print load every nstlog, first and last step to the log file */
8248         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8249                     comm->n_load_collect == 0 ||
8250                     (step + ir->nstlist > ir->init_step + ir->nsteps));
8251
8252         /* Avoid extra communication due to verbose screen output
8253          * when nstglobalcomm is set.
8254          */
8255         if (bDoDLB || bLogLoad || bCheckDLB ||
8256             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8257         {
8258             get_load_distribution(dd,wcycle);
8259             if (DDMASTER(dd))
8260             {
8261                 if (bLogLoad)
8262                 {
8263                     dd_print_load(fplog,dd,step-1);
8264                 }
8265                 if (bVerbose)
8266                 {
8267                     dd_print_load_verbose(dd);
8268                 }
8269             }
8270             comm->n_load_collect++;
8271
8272             if (bCheckDLB) {
8273                 /* Since the timings are node dependent, the master decides */
8274                 if (DDMASTER(dd))
8275                 {
8276                     bTurnOnDLB =
8277                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8278                     if (debug)
8279                     {
8280                         fprintf(debug,"step %s, imb loss %f\n",
8281                                 gmx_step_str(step,sbuf),
8282                                 dd_force_imb_perf_loss(dd));
8283                     }
8284                 }
8285                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8286                 if (bTurnOnDLB)
8287                 {
8288                     turn_on_dlb(fplog,cr,step);
8289                     bDoDLB = TRUE;
8290                 }
8291             }
8292         }
8293         comm->n_load_have++;
8294     }
8295
8296     cgs_gl = &comm->cgs_gl;
8297
8298     bRedist = FALSE;
8299     if (bMasterState)
8300     {
8301         /* Clear the old state */
8302         clear_dd_indices(dd,0,0);
8303
8304         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8305                   TRUE,cgs_gl,state_global->x,&ddbox);
8306
8307         get_cg_distribution(fplog,step,dd,cgs_gl,
8308                             state_global->box,&ddbox,state_global->x);
8309
8310         dd_distribute_state(dd,cgs_gl,
8311                             state_global,state_local,f);
8312
8313         dd_make_local_cgs(dd,&top_local->cgs);
8314
8315         if (dd->ncg_home > fr->cg_nalloc)
8316         {
8317             dd_realloc_fr_cg(fr,dd->ncg_home);
8318         }
8319         calc_cgcm(fplog,0,dd->ncg_home,
8320                   &top_local->cgs,state_local->x,fr->cg_cm);
8321
8322         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8323
8324         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8325
8326         cg0 = 0;
8327     }
8328     else if (state_local->ddp_count != dd->ddp_count)
8329     {
8330         if (state_local->ddp_count > dd->ddp_count)
8331         {
8332             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8333         }
8334
8335         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8336         {
8337             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8338         }
8339
8340         /* Clear the old state */
8341         clear_dd_indices(dd,0,0);
8342
8343         /* Build the new indices */
8344         rebuild_cgindex(dd,cgs_gl->index,state_local);
8345         make_dd_indices(dd,cgs_gl->index,0);
8346
8347         /* Redetermine the cg COMs */
8348         calc_cgcm(fplog,0,dd->ncg_home,
8349                   &top_local->cgs,state_local->x,fr->cg_cm);
8350
8351         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8352
8353         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8354
8355         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8356                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8357
8358         bRedist = comm->bDynLoadBal;
8359     }
8360     else
8361     {
8362         /* We have the full state, only redistribute the cgs */
8363
8364         /* Clear the non-home indices */
8365         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8366
8367         /* Avoid global communication for dim's without pbc and -gcom */
8368         if (!bNStGlobalComm)
8369         {
8370             copy_rvec(comm->box0    ,ddbox.box0    );
8371             copy_rvec(comm->box_size,ddbox.box_size);
8372         }
8373         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8374                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8375
8376         bBoxChanged = TRUE;
8377         bRedist = TRUE;
8378     }
8379     /* For dim's without pbc and -gcom */
8380     copy_rvec(ddbox.box0    ,comm->box0    );
8381     copy_rvec(ddbox.box_size,comm->box_size);
8382
8383     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8384                       step,wcycle);
8385
8386     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8387     {
8388         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8389     }
8390
8391     /* Check if we should sort the charge groups */
8392     if (comm->nstSortCG > 0)
8393     {
8394         bSortCG = (bMasterState ||
8395                    (bRedist && (step % comm->nstSortCG == 0)));
8396     }
8397     else
8398     {
8399         bSortCG = FALSE;
8400     }
8401
8402     ncg_home_old = dd->ncg_home;
8403
8404     if (bRedist)
8405     {
8406         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8407                                  state_local,f,fr,mdatoms,
8408                                  !bSortCG,nrnb);
8409     }
8410
8411     get_nsgrid_boundaries(fr->ns.grid,dd,
8412                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8413                           dd->ncg_home,fr->cg_cm,
8414                           cell_ns_x0,cell_ns_x1,&grid_density);
8415
8416     if (bBoxChanged)
8417     {
8418         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8419     }
8420
8421     copy_ivec(fr->ns.grid->n,ncells_old);
8422     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8423                state_local->box,cell_ns_x0,cell_ns_x1,
8424                fr->rlistlong,grid_density);
8425     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8426     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8427
8428     if (bSortCG)
8429     {
8430         /* Sort the state on charge group position.
8431          * This enables exact restarts from this step.
8432          * It also improves performance by about 15% with larger numbers
8433          * of atoms per node.
8434          */
8435
8436         /* Fill the ns grid with the home cell,
8437          * so we can sort with the indices.
8438          */
8439         set_zones_ncg_home(dd);
8440         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8441                   0,dd->ncg_home,fr->cg_cm);
8442
8443         /* Check if we can user the old order and ns grid cell indices
8444          * of the charge groups to sort the charge groups efficiently.
8445          */
8446         bResortAll = (bMasterState ||
8447                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8448                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8449                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8450
8451         if (debug)
8452         {
8453             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8454                     gmx_step_str(step,sbuf),dd->ncg_home);
8455         }
8456         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8457                       bResortAll ? -1 : ncg_home_old);
8458         /* Rebuild all the indices */
8459         cg0 = 0;
8460         ga2la_clear(dd->ga2la);
8461     }
8462
8463     /* Setup up the communication and communicate the coordinates */
8464     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8465
8466     /* Set the indices */
8467     make_dd_indices(dd,cgs_gl->index,cg0);
8468
8469     /* Set the charge group boundaries for neighbor searching */
8470     set_cg_boundaries(&comm->zones);
8471
8472     /*
8473     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8474                  -1,state_local->x,state_local->box);
8475     */
8476
8477     /* Extract a local topology from the global topology */
8478     for(i=0; i<dd->ndim; i++)
8479     {
8480         np[dd->dim[i]] = comm->cd[i].np;
8481     }
8482     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8483                       comm->cellsize_min,np,
8484                       fr,vsite,top_global,top_local);
8485
8486     /* Set up the special atom communication */
8487     n = comm->nat[ddnatZONE];
8488     for(i=ddnatZONE+1; i<ddnatNR; i++)
8489     {
8490         switch(i)
8491         {
8492         case ddnatVSITE:
8493             if (vsite && vsite->n_intercg_vsite)
8494             {
8495                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8496             }
8497             break;
8498         case ddnatCON:
8499             if (dd->bInterCGcons)
8500             {
8501                 /* Only for inter-cg constraints we need special code */
8502                 n = dd_make_local_constraints(dd,n,top_global,
8503                                               constr,ir->nProjOrder,
8504                                               &top_local->idef.il[F_CONSTR]);
8505             }
8506             break;
8507         default:
8508             gmx_incons("Unknown special atom type setup");
8509         }
8510         comm->nat[i] = n;
8511     }
8512
8513     /* Make space for the extra coordinates for virtual site
8514      * or constraint communication.
8515      */
8516     state_local->natoms = comm->nat[ddnatNR-1];
8517     if (state_local->natoms > state_local->nalloc)
8518     {
8519         dd_realloc_state(state_local,f,state_local->natoms);
8520     }
8521
8522     if (fr->bF_NoVirSum)
8523     {
8524         if (vsite && vsite->n_intercg_vsite)
8525         {
8526             nat_f_novirsum = comm->nat[ddnatVSITE];
8527         }
8528         else
8529         {
8530             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8531             {
8532                 nat_f_novirsum = dd->nat_tot;
8533             }
8534             else
8535             {
8536                 nat_f_novirsum = dd->nat_home;
8537             }
8538         }
8539     }
8540     else
8541     {
8542         nat_f_novirsum = 0;
8543     }
8544
8545     /* Set the number of atoms required for the force calculation.
8546      * Forces need to be constrained when using a twin-range setup
8547      * or with energy minimization. For simple simulations we could
8548      * avoid some allocation, zeroing and copying, but this is
8549      * probably not worth the complications ande checking.
8550      */
8551     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8552                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8553
8554     /* We make the all mdatoms up to nat_tot_con.
8555      * We could save some work by only setting invmass
8556      * between nat_tot and nat_tot_con.
8557      */
8558     /* This call also sets the new number of home particles to dd->nat_home */
8559     atoms2md(top_global,ir,
8560              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8561
8562     /* Now we have the charges we can sort the FE interactions */
8563     dd_sort_local_top(dd,mdatoms,top_local);
8564
8565     if (shellfc)
8566     {
8567         /* Make the local shell stuff, currently no communication is done */
8568         make_local_shells(cr,mdatoms,shellfc);
8569     }
8570
8571         if (ir->implicit_solvent)
8572     {
8573         make_local_gb(cr,fr->born,ir->gb_algorithm);
8574     }
8575
8576     if (!(cr->duty & DUTY_PME))
8577     {
8578         /* Send the charges to our PME only node */
8579         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8580                        mdatoms->chargeA,mdatoms->chargeB,
8581                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8582     }
8583
8584     if (constr)
8585     {
8586         set_constraints(constr,top_local,ir,mdatoms,cr);
8587     }
8588
8589     if (ir->ePull != epullNO)
8590     {
8591         /* Update the local pull groups */
8592         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8593     }
8594
8595     add_dd_statistics(dd);
8596
8597     /* Make sure we only count the cycles for this DD partitioning */
8598     clear_dd_cycle_counts(dd);
8599
8600     /* Because the order of the atoms might have changed since
8601      * the last vsite construction, we need to communicate the constructing
8602      * atom coordinates again (for spreading the forces this MD step).
8603      */
8604     dd_move_x_vsites(dd,state_local->box,state_local->x);
8605
8606     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8607     {
8608         dd_move_x(dd,state_local->box,state_local->x);
8609         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8610                      -1,state_local->x,state_local->box);
8611     }
8612
8613     /* Store the partitioning step */
8614     comm->partition_step = step;
8615
8616     /* Increase the DD partitioning counter */
8617     dd->ddp_count++;
8618     /* The state currently matches this DD partitioning count, store it */
8619     state_local->ddp_count = dd->ddp_count;
8620     if (bMasterState)
8621     {
8622         /* The DD master node knows the complete cg distribution,
8623          * store the count so we can possibly skip the cg info communication.
8624          */
8625         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8626     }
8627
8628     if (comm->DD_debug > 0)
8629     {
8630         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8631         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8632                                 "after partitioning");
8633     }
8634 }