fixed segv in Verlet pair-search with triclinic DD
[gromacs.git] / src / mdlib / domdec.c
blob2f19d21b3b39fbdd665e5720ac333fa245dcffc9
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2008
5 * Copyright (c) 2012, by the GROMACS development team, led by
6 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
7 * others, as listed in the AUTHORS file in the top-level source
8 * directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 #ifdef HAVE_CONFIG_H
38 #include <config.h>
39 #endif
41 #include <stdio.h>
42 #include <time.h>
43 #include <math.h>
44 #include <string.h>
45 #include <stdlib.h>
46 #include "typedefs.h"
47 #include "smalloc.h"
48 #include "gmx_fatal.h"
49 #include "gmx_fatal_collective.h"
50 #include "vec.h"
51 #include "domdec.h"
52 #include "domdec_network.h"
53 #include "nrnb.h"
54 #include "pbc.h"
55 #include "chargegroup.h"
56 #include "constr.h"
57 #include "mdatoms.h"
58 #include "names.h"
59 #include "pdbio.h"
60 #include "futil.h"
61 #include "force.h"
62 #include "pme.h"
63 #include "pull.h"
64 #include "pull_rotation.h"
65 #include "gmx_wallcycle.h"
66 #include "mdrun.h"
67 #include "nsgrid.h"
68 #include "shellfc.h"
69 #include "mtop_util.h"
70 #include "gmxfio.h"
71 #include "gmx_ga2la.h"
72 #include "gmx_sort.h"
73 #include "nbnxn_search.h"
74 #include "bondf.h"
75 #include "gmx_omp_nthreads.h"
77 #ifdef GMX_LIB_MPI
78 #include <mpi.h>
79 #endif
80 #ifdef GMX_THREAD_MPI
81 #include "tmpi.h"
82 #endif
84 #define DDRANK(dd,rank) (rank)
85 #define DDMASTERRANK(dd) (dd->masterrank)
/* Bookkeeping for the global charge group distribution, reached through
 * dd->ma. The code in this file only dereferences it when DDMASTER(dd)
 * holds (see dd_collect_cg and dd_collect_vec_sendrecv).
 */
87 typedef struct gmx_domdec_master
89 /* The cell boundaries */
90 real **cell_x;
91 /* The global charge group division */
92 int *ncg; /* Number of home charge groups for each node */
93 int *index; /* Index of nnodes+1 into cg */
94 int *cg; /* Global charge group index */
95 int *nat; /* Number of home atoms for each node. */
96 int *ibuf; /* Buffer for communication */
97 rvec *vbuf; /* Buffer for state scattering and gathering */
98 } gmx_domdec_master_t;
/* Send/receive counts and charge group indices for one communication pulse;
 * gmx_domdec_comm_dim_t holds an array of these in its ind member.
 */
100 typedef struct
102 /* The numbers of charge groups to send and receive for each cell
103 * that requires communication, the last entry contains the total
104 * number of atoms that needs to be communicated.
106 int nsend[DD_MAXIZONE+2];
107 int nrecv[DD_MAXIZONE+2];
108 /* The charge groups to send */
109 int *index;
110 int nalloc;
111 /* The atom range for non-in-place communication */
112 int cell2at0[DD_MAXIZONE];
113 int cell2at1[DD_MAXIZONE];
114 } gmx_domdec_ind_t;
/* Communication setup for one decomposition dimension: the number of
 * pulses and the per-pulse index data (gmx_domdec_comm_t keeps one per
 * dimension in its cd array).
 */
116 typedef struct
118 int np; /* Number of grid pulses in this dimension */
119 int np_dlb; /* For dlb, for use with edlbAUTO */
120 gmx_domdec_ind_t *ind; /* The indices to communicate, size np */
121 int np_nalloc;
122 gmx_bool bInPlace; /* Can we communicate in place? */
123 } gmx_domdec_comm_dim_t;
/* Cell boundary data for dynamic load balancing; gmx_domdec_comm_t refers
 * to these through its root member. "State var." and "Temp. var." in the
 * field comments are the original author's classification.
 */
125 typedef struct
127 gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */
128 real *cell_f; /* State var.: cell boundaries, box relative */
129 real *old_cell_f; /* Temp. var.: old cell size */
130 real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
131 real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
132 real *bound_min; /* Temp. var.: lower limit for cell boundary */
133 real *bound_max; /* Temp. var.: upper limit for cell boundary */
134 gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */
135 real *buf_ncd; /* Temp. var. */
136 } gmx_domdec_root_t;
138 #define DD_NLOAD_MAX 9
140 /* Here floats are accurate enough, since these variables
141 * only influence the load balancing, not the actual MD results.
/* Load measurement record used for load balancing; gmx_domdec_comm_t
 * refers to these through its load member.
 * NOTE(review): presumably load holds nload entries - confirm against the
 * allocation code, which is not visible here.
 */
143 typedef struct
145 int nload;
146 float *load;
147 float sum;
148 float max;
149 float sum_m;
150 float cvol_min;
151 float mdf;
152 float pme;
153 int flags;
154 } gmx_domdec_load_t;
/* One entry of the charge group sort arrays in gmx_domdec_sort_t.
 * NOTE(review): presumably nsc is the sort key and ind_gl/ind are the
 * global and local charge group indices - confirm against the sorting code.
 */
156 typedef struct
158 int nsc;
159 int ind_gl;
160 int ind;
161 } gmx_cgsort_t;
/* Work buffers for charge group sorting; gmx_domdec_comm_t refers to one
 * through its sort member. Each *_nalloc field sits next to the buffer(s)
 * it presumably sizes - confirm against the allocation code.
 */
163 typedef struct
165 gmx_cgsort_t *sort;
166 gmx_cgsort_t *sort2;
167 int sort_nalloc;
168 gmx_cgsort_t *sort_new;
169 int sort_new_nalloc;
170 int *ibuf;
171 int ibuf_nalloc;
172 } gmx_domdec_sort_t;
/* Growable rvec buffer: v is the storage and nalloc its allocated size
 * in elements (see vec_rvec_init and vec_rvec_check_alloc).
 */
174 typedef struct
176 rvec *v;
177 int nalloc;
178 } vec_rvec_t;
/* Atom range categories (ddnat*) and dynamic load balancing options (edlb*).
 * edlb_names lists the option strings in the order of the edlb enum.
 */
180 /* This enum determines the order of the coordinates.
181 * ddnatHOME and ddnatZONE should be first and second,
182 * the others can be ordered as wanted.
184 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
186 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
187 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
/* PME slab setup for one PME decomposition dimension; gmx_domdec_comm_t
 * holds two of these in its ddpme array.
 */
189 typedef struct
191 int dim; /* The dimension */
192 gmx_bool dim_match;/* Tells if DD and PME dims match */
193 int nslab; /* The number of PME slabs in this dimension */
194 real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */
195 int *pp_min; /* The minimum pp node location, size nslab */
196 int *pp_max; /* The maximum pp node location,size nslab */
197 int maxshift; /* The maximum shift for coordinate redistribution in PME */
198 } gmx_ddpme_t;
/* Zone limits along one decomposition dimension. These records are what
 * dd_move_cellx exchanges between cells (via dd_sendrecv_ddzone) and stores
 * in the zone_d1/zone_d2 members of gmx_domdec_comm_t.
 */
200 typedef struct
202 real min0; /* The minimum bottom of this zone */
203 real max1; /* The maximum top of this zone */
204 real min1; /* The minimum top of this zone */
205 real mch0; /* The maximum bottom communication height for this zone */
206 real mch1; /* The maximum top communication height for this zone */
207 real p1_0; /* The bottom value of the first cell in this zone */
208 real p1_1; /* The top value of the first cell in this zone */
209 } gmx_ddzone_t;
/* Scratch data for the thread parallel communication setup; referenced by
 * the dth member of gmx_domdec_comm_t.
 * NOTE(review): presumably there is one of these per thread (nth entries) -
 * confirm against the allocation code.
 */
211 typedef struct
213 gmx_domdec_ind_t ind;
214 int *ibuf;
215 int ibuf_nalloc;
216 vec_rvec_t vbuf;
217 int nsend;
218 int nat;
219 int nsend_zone;
220 } dd_comm_setup_work_t;
/* The communication state of the domain decomposition, reached through
 * dd->comm: PME/PP node layout, global charge groups, cell boundaries,
 * zone limits, communication buffers, load balancing data, cycle and flop
 * counters, and statistics.
 */
222 typedef struct gmx_domdec_comm
224 /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
225 * unless stated otherwise.
228 /* The number of decomposition dimensions for PME, 0: no PME */
229 int npmedecompdim;
230 /* The number of nodes doing PME (PP/PME or only PME) */
231 int npmenodes;
232 int npmenodes_x;
233 int npmenodes_y;
234 /* The communication setup including the PME only nodes */
235 gmx_bool bCartesianPP_PME;
236 ivec ntot;
237 int cartpmedim;
238 int *pmenodes; /* size npmenodes */
239 int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
240 * but with bCartesianPP_PME */
241 gmx_ddpme_t ddpme[2];
243 /* The DD particle-particle nodes only */
244 gmx_bool bCartesianPP;
245 int *ddindex2ddnodeid; /* size npmenodes, only with bCartesianPP_PME */
247 /* The global charge groups */
248 t_block cgs_gl;
250 /* Should we sort the cgs */
251 int nstSortCG;
252 gmx_domdec_sort_t *sort;
254 /* Are there charge groups? */
255 gmx_bool bCGs;
257 /* Are there bonded and multi-body interactions between charge groups? */
258 gmx_bool bInterCGBondeds;
259 gmx_bool bInterCGMultiBody;
261 /* Data for the optional bonded interaction atom communication range */
262 gmx_bool bBondComm;
263 t_blocka *cglink;
264 char *bLocalCG;
266 /* The DLB option */
267 int eDLB;
268 /* Are we actually using DLB? */
269 gmx_bool bDynLoadBal;
271 /* Cell sizes for static load balancing, first index cartesian */
272 real **slb_frac;
274 /* The width of the communicated boundaries */
275 real cutoff_mbody;
276 real cutoff;
277 /* The minimum cell size (including triclinic correction) */
278 rvec cellsize_min;
279 /* For dlb, for use with edlbAUTO */
280 rvec cellsize_min_dlb;
281 /* The lower limit for the DD cell size with DLB */
282 real cellsize_limit;
283 /* Effectively no NB cut-off limit with DLB for systems without PBC? */
284 gmx_bool bVacDLBNoLimit;
286 /* tric_dir is only stored here because dd_get_ns_ranges needs it */
287 ivec tric_dir;
288 /* box0 and box_size are required with dim's without pbc and -gcom */
289 rvec box0;
290 rvec box_size;
292 /* The cell boundaries */
293 rvec cell_x0;
294 rvec cell_x1;
296 /* The old location of the cell boundaries, to check cg displacements */
297 rvec old_cell_x0;
298 rvec old_cell_x1;
300 /* The communication setup and charge group boundaries for the zones */
301 gmx_domdec_zones_t zones;
303 /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
304 * cell boundaries of neighboring cells for dynamic load balancing.
306 gmx_ddzone_t zone_d1[2];
307 gmx_ddzone_t zone_d2[2][2];
309 /* The coordinate/force communication setup and indices */
310 gmx_domdec_comm_dim_t cd[DIM];
311 /* The maximum number of cells to communicate with in one dimension */
312 int maxpulse;
314 /* Which cg distribution is stored on the master node */
315 int master_cg_ddp_count;
317 /* The number of cg's received from the direct neighbors */
318 int zone_ncg1[DD_MAXZONE];
320 /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
321 int nat[ddnatNR];
323 /* Array for signalling if atoms have moved to another domain */
324 int *moved;
325 int moved_nalloc;
327 /* Communication buffer for general use */
328 int *buf_int;
329 int nalloc_int;
331 /* Communication buffer for general use */
332 vec_rvec_t vbuf;
334 /* Temporary storage for thread parallel communication setup */
335 int nth;
336 dd_comm_setup_work_t *dth;
338 /* Communication buffers only used with multiple grid pulses */
339 int *buf_int2;
340 int nalloc_int2;
341 vec_rvec_t vbuf2;
343 /* Communication buffers for local redistribution */
344 int **cggl_flag;
345 int cggl_flag_nalloc[DIM*2];
346 rvec **cgcm_state;
347 int cgcm_state_nalloc[DIM*2];
349 /* Cell sizes for dynamic load balancing */
350 gmx_domdec_root_t **root;
351 real *cell_f_row;
352 real cell_f0[DIM];
353 real cell_f1[DIM];
354 real cell_f_max0[DIM];
355 real cell_f_min1[DIM];
357 /* Stuff for load communication */
358 gmx_bool bRecordLoad;
359 gmx_domdec_load_t *load;
360 #ifdef GMX_MPI
361 MPI_Comm *mpi_comm_load;
362 #endif
364 /* Maximum DLB scaling per load balancing step in percent */
365 int dlb_scale_lim;
367 /* Cycle counters */
368 float cycl[ddCyclNr];
369 int cycl_n[ddCyclNr];
370 float cycl_max[ddCyclNr];
371 /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise) */
372 int eFlop;
373 double flop;
374 int flop_n;
375 /* How often did we have load measurements */
376 int n_load_have;
377 /* How often have we collected the load measurements */
378 int n_load_collect;
380 /* Statistics */
381 double sum_nat[ddnatNR-ddnatZONE];
382 int ndecomp;
383 int nload;
384 double load_step;
385 double load_sum;
386 double load_max;
387 ivec load_lim;
388 double load_mdf;
389 double load_pme;
391 /* The last partition step */
392 gmx_large_int_t partition_step;
394 /* Debugging */
395 int nstDDDump;
396 int nstDDDumpGrid;
397 int DD_debug;
398 } gmx_domdec_comm_t;
/* Constants for the cggl_flag buffer and the zone setup tables.
 * DD_FLAG_NRCG (65535) masks the low 16 bits of a flag word; DD_FLAG_FW(d)
 * and DD_FLAG_BW(d) are bits 16+2*d and 17+2*d respectively.
 * NOTE(review): presumably the low 16 bits hold the charge group size and
 * the FW/BW bits mark a move forward/backward along dimension d - confirm
 * against the redistribution code, which is not visible here.
 */
400 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
401 #define DD_CGIBS 2
403 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
404 #define DD_FLAG_NRCG 65535
405 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
406 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
408 /* Zone permutation required to obtain consecutive charge groups
409 * for neighbor searching.
411 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
413 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
414 * components see only j zones with that component 0.
417 /* The DD zone order */
418 static const ivec dd_zo[DD_MAXZONE] =
419 {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
421 /* The 3D setup */
422 #define dd_z3n 8
423 #define dd_zp3n 4
424 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
426 /* The 2D setup */
427 #define dd_z2n 4
428 #define dd_zp2n 2
429 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
431 /* The 1D setup */
432 #define dd_z1n 2
433 #define dd_zp1n 1
434 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
436 /* Factors used to avoid problems due to rounding issues */
437 #define DD_CELL_MARGIN 1.0001
438 #define DD_CELL_MARGIN2 1.00005
439 /* Factor to account for pressure scaling during nstlist steps */
440 #define DD_PRES_SCALE_MARGIN 1.02
442 /* Allowed performance loss before we DLB or warn */
443 #define DD_PERF_LOSS 0.05
445 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
447 /* Use separate MPI send and receive commands
448 * when nnodes <= GMX_DD_NNODES_SENDRECV.
449 * This saves memory (and some copying for small nnodes).
450 * For high parallelization scatter and gather calls are used.
452 #define GMX_DD_NNODES_SENDRECV 4
/* Alternative cell index ordering with z as the slowest and x as the
 * fastest running dimension, plus its inverse index2xyz.
 * NOTE(review): dd_index is defined again a few lines further down with
 * the opposite ordering, and the embedded line numbering skips the lines
 * directly around this fragment; presumably this variant is commented out
 * in the upstream file - confirm before treating it as live code.
 */
456 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
458 static void index2xyz(ivec nc,int ind,ivec xyz)
460 xyz[XX] = ind % nc[XX];
461 xyz[YY] = (ind / nc[XX]) % nc[YY];
462 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
466 /* This order is required to minimize the coordinate communication in PME
467 * which uses decomposition in the x direction.
469 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
471 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
473 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
474 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
475 xyz[ZZ] = ind % nc[ZZ];
478 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
480 int ddindex;
481 int ddnodeid=-1;
483 ddindex = dd_index(dd->nc,c);
484 if (dd->comm->bCartesianPP_PME)
486 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
488 else if (dd->comm->bCartesianPP)
490 #ifdef GMX_MPI
491 MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
492 #endif
494 else
496 ddnodeid = ddindex;
499 return ddnodeid;
502 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
504 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
507 int ddglatnr(gmx_domdec_t *dd,int i)
509 int atnr;
511 if (dd == NULL)
513 atnr = i + 1;
515 else
517 if (i >= dd->comm->nat[ddnatNR-1])
519 gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
521 atnr = dd->gatindex[i] + 1;
524 return atnr;
527 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
529 return &dd->comm->cgs_gl;
532 static void vec_rvec_init(vec_rvec_t *v)
534 v->nalloc = 0;
535 v->v = NULL;
538 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
540 if (n > v->nalloc)
542 v->nalloc = over_alloc_dd(n);
543 srenew(v->v,v->nalloc);
547 void dd_store_state(gmx_domdec_t *dd,t_state *state)
549 int i;
551 if (state->ddp_count != dd->ddp_count)
553 gmx_incons("The state does not the domain decomposition state");
556 state->ncg_gl = dd->ncg_home;
557 if (state->ncg_gl > state->cg_gl_nalloc)
559 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
560 srenew(state->cg_gl,state->cg_gl_nalloc);
562 for(i=0; i<state->ncg_gl; i++)
564 state->cg_gl[i] = dd->index_gl[i];
567 state->ddp_count_cg_gl = dd->ddp_count;
570 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
572 return &dd->comm->zones;
575 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
576 int *jcg0,int *jcg1,ivec shift0,ivec shift1)
578 gmx_domdec_zones_t *zones;
579 int izone,d,dim;
581 zones = &dd->comm->zones;
583 izone = 0;
584 while (icg >= zones->izone[izone].cg1)
586 izone++;
589 if (izone == 0)
591 *jcg0 = icg;
593 else if (izone < zones->nizone)
595 *jcg0 = zones->izone[izone].jcg0;
597 else
599 gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
600 icg,izone,zones->nizone);
603 *jcg1 = zones->izone[izone].jcg1;
605 for(d=0; d<dd->ndim; d++)
607 dim = dd->dim[d];
608 shift0[dim] = zones->izone[izone].shift0[dim];
609 shift1[dim] = zones->izone[izone].shift1[dim];
610 if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
612 /* A conservative approach, this can be optimized */
613 shift0[dim] -= 1;
614 shift1[dim] += 1;
619 int dd_natoms_vsite(gmx_domdec_t *dd)
621 return dd->comm->nat[ddnatVSITE];
624 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
626 *at_start = dd->comm->nat[ddnatCON-1];
627 *at_end = dd->comm->nat[ddnatCON];
630 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
632 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
633 int *index,*cgindex;
634 gmx_domdec_comm_t *comm;
635 gmx_domdec_comm_dim_t *cd;
636 gmx_domdec_ind_t *ind;
637 rvec shift={0,0,0},*buf,*rbuf;
638 gmx_bool bPBC,bScrew;
640 comm = dd->comm;
642 cgindex = dd->cgindex;
644 buf = comm->vbuf.v;
646 nzone = 1;
647 nat_tot = dd->nat_home;
648 for(d=0; d<dd->ndim; d++)
650 bPBC = (dd->ci[dd->dim[d]] == 0);
651 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
652 if (bPBC)
654 copy_rvec(box[dd->dim[d]],shift);
656 cd = &comm->cd[d];
657 for(p=0; p<cd->np; p++)
659 ind = &cd->ind[p];
660 index = ind->index;
661 n = 0;
662 if (!bPBC)
664 for(i=0; i<ind->nsend[nzone]; i++)
666 at0 = cgindex[index[i]];
667 at1 = cgindex[index[i]+1];
668 for(j=at0; j<at1; j++)
670 copy_rvec(x[j],buf[n]);
671 n++;
675 else if (!bScrew)
677 for(i=0; i<ind->nsend[nzone]; i++)
679 at0 = cgindex[index[i]];
680 at1 = cgindex[index[i]+1];
681 for(j=at0; j<at1; j++)
683 /* We need to shift the coordinates */
684 rvec_add(x[j],shift,buf[n]);
685 n++;
689 else
691 for(i=0; i<ind->nsend[nzone]; i++)
693 at0 = cgindex[index[i]];
694 at1 = cgindex[index[i]+1];
695 for(j=at0; j<at1; j++)
697 /* Shift x */
698 buf[n][XX] = x[j][XX] + shift[XX];
699 /* Rotate y and z.
700 * This operation requires a special shift force
701 * treatment, which is performed in calc_vir.
703 buf[n][YY] = box[YY][YY] - x[j][YY];
704 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
705 n++;
710 if (cd->bInPlace)
712 rbuf = x + nat_tot;
714 else
716 rbuf = comm->vbuf2.v;
718 /* Send and receive the coordinates */
719 dd_sendrecv_rvec(dd, d, dddirBackward,
720 buf, ind->nsend[nzone+1],
721 rbuf, ind->nrecv[nzone+1]);
722 if (!cd->bInPlace)
724 j = 0;
725 for(zone=0; zone<nzone; zone++)
727 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
729 copy_rvec(rbuf[j],x[i]);
730 j++;
734 nat_tot += ind->nrecv[nzone+1];
736 nzone += nzone;
740 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
742 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
743 int *index,*cgindex;
744 gmx_domdec_comm_t *comm;
745 gmx_domdec_comm_dim_t *cd;
746 gmx_domdec_ind_t *ind;
747 rvec *buf,*sbuf;
748 ivec vis;
749 int is;
750 gmx_bool bPBC,bScrew;
752 comm = dd->comm;
754 cgindex = dd->cgindex;
756 buf = comm->vbuf.v;
758 n = 0;
759 nzone = comm->zones.n/2;
760 nat_tot = dd->nat_tot;
761 for(d=dd->ndim-1; d>=0; d--)
763 bPBC = (dd->ci[dd->dim[d]] == 0);
764 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
765 if (fshift == NULL && !bScrew)
767 bPBC = FALSE;
769 /* Determine which shift vector we need */
770 clear_ivec(vis);
771 vis[dd->dim[d]] = 1;
772 is = IVEC2IS(vis);
774 cd = &comm->cd[d];
775 for(p=cd->np-1; p>=0; p--) {
776 ind = &cd->ind[p];
777 nat_tot -= ind->nrecv[nzone+1];
778 if (cd->bInPlace)
780 sbuf = f + nat_tot;
782 else
784 sbuf = comm->vbuf2.v;
785 j = 0;
786 for(zone=0; zone<nzone; zone++)
788 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
790 copy_rvec(f[i],sbuf[j]);
791 j++;
795 /* Communicate the forces */
796 dd_sendrecv_rvec(dd, d, dddirForward,
797 sbuf, ind->nrecv[nzone+1],
798 buf, ind->nsend[nzone+1]);
799 index = ind->index;
800 /* Add the received forces */
801 n = 0;
802 if (!bPBC)
804 for(i=0; i<ind->nsend[nzone]; i++)
806 at0 = cgindex[index[i]];
807 at1 = cgindex[index[i]+1];
808 for(j=at0; j<at1; j++)
810 rvec_inc(f[j],buf[n]);
811 n++;
815 else if (!bScrew)
817 for(i=0; i<ind->nsend[nzone]; i++)
819 at0 = cgindex[index[i]];
820 at1 = cgindex[index[i]+1];
821 for(j=at0; j<at1; j++)
823 rvec_inc(f[j],buf[n]);
824 /* Add this force to the shift force */
825 rvec_inc(fshift[is],buf[n]);
826 n++;
830 else
832 for(i=0; i<ind->nsend[nzone]; i++)
834 at0 = cgindex[index[i]];
835 at1 = cgindex[index[i]+1];
836 for(j=at0; j<at1; j++)
838 /* Rotate the force */
839 f[j][XX] += buf[n][XX];
840 f[j][YY] -= buf[n][YY];
841 f[j][ZZ] -= buf[n][ZZ];
842 if (fshift)
844 /* Add this force to the shift force */
845 rvec_inc(fshift[is],buf[n]);
847 n++;
852 nzone /= 2;
856 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
858 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
859 int *index,*cgindex;
860 gmx_domdec_comm_t *comm;
861 gmx_domdec_comm_dim_t *cd;
862 gmx_domdec_ind_t *ind;
863 real *buf,*rbuf;
865 comm = dd->comm;
867 cgindex = dd->cgindex;
869 buf = &comm->vbuf.v[0][0];
871 nzone = 1;
872 nat_tot = dd->nat_home;
873 for(d=0; d<dd->ndim; d++)
875 cd = &comm->cd[d];
876 for(p=0; p<cd->np; p++)
878 ind = &cd->ind[p];
879 index = ind->index;
880 n = 0;
881 for(i=0; i<ind->nsend[nzone]; i++)
883 at0 = cgindex[index[i]];
884 at1 = cgindex[index[i]+1];
885 for(j=at0; j<at1; j++)
887 buf[n] = v[j];
888 n++;
892 if (cd->bInPlace)
894 rbuf = v + nat_tot;
896 else
898 rbuf = &comm->vbuf2.v[0][0];
900 /* Send and receive the coordinates */
901 dd_sendrecv_real(dd, d, dddirBackward,
902 buf, ind->nsend[nzone+1],
903 rbuf, ind->nrecv[nzone+1]);
904 if (!cd->bInPlace)
906 j = 0;
907 for(zone=0; zone<nzone; zone++)
909 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
911 v[i] = rbuf[j];
912 j++;
916 nat_tot += ind->nrecv[nzone+1];
918 nzone += nzone;
922 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
924 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
925 int *index,*cgindex;
926 gmx_domdec_comm_t *comm;
927 gmx_domdec_comm_dim_t *cd;
928 gmx_domdec_ind_t *ind;
929 real *buf,*sbuf;
931 comm = dd->comm;
933 cgindex = dd->cgindex;
935 buf = &comm->vbuf.v[0][0];
937 n = 0;
938 nzone = comm->zones.n/2;
939 nat_tot = dd->nat_tot;
940 for(d=dd->ndim-1; d>=0; d--)
942 cd = &comm->cd[d];
943 for(p=cd->np-1; p>=0; p--) {
944 ind = &cd->ind[p];
945 nat_tot -= ind->nrecv[nzone+1];
946 if (cd->bInPlace)
948 sbuf = v + nat_tot;
950 else
952 sbuf = &comm->vbuf2.v[0][0];
953 j = 0;
954 for(zone=0; zone<nzone; zone++)
956 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
958 sbuf[j] = v[i];
959 j++;
963 /* Communicate the forces */
964 dd_sendrecv_real(dd, d, dddirForward,
965 sbuf, ind->nrecv[nzone+1],
966 buf, ind->nsend[nzone+1]);
967 index = ind->index;
968 /* Add the received forces */
969 n = 0;
970 for(i=0; i<ind->nsend[nzone]; i++)
972 at0 = cgindex[index[i]];
973 at1 = cgindex[index[i]+1];
974 for(j=at0; j<at1; j++)
976 v[j] += buf[n];
977 n++;
981 nzone /= 2;
985 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
987 fprintf(fp,"zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
988 d,i,j,
989 zone->min0,zone->max1,
990 zone->mch0,zone->mch0,
991 zone->p1_0,zone->p1_1);
995 #define DDZONECOMM_MAXZONE 5
996 #define DDZONECOMM_BUFSIZE 3
998 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
999 int ddimind,int direction,
1000 gmx_ddzone_t *buf_s,int n_s,
1001 gmx_ddzone_t *buf_r,int n_r)
1003 #define ZBS DDZONECOMM_BUFSIZE
1004 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1005 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1006 int i;
1008 for(i=0; i<n_s; i++)
1010 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
1011 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
1012 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
1013 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1014 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1015 vbuf_s[i*ZBS+1][2] = 0;
1016 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1017 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1018 vbuf_s[i*ZBS+2][2] = 0;
1021 dd_sendrecv_rvec(dd, ddimind, direction,
1022 vbuf_s, n_s*ZBS,
1023 vbuf_r, n_r*ZBS);
1025 for(i=0; i<n_r; i++)
1027 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
1028 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
1029 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
1030 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1031 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1032 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1033 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1036 #undef ZBS
/* Exchanges cell boundary information with the neighboring cells.
 *
 * The zone records for decomposition dimensions 1 and 2 are initialized
 * from cell_ns_x0/cell_ns_x1. Then, from the last-but-one decomposition
 * dimension down to the first, the cell fraction extremes are sent
 * forward and the zone records backward, accumulating the extremes over
 * all pulses. The results are stored in comm->zone_d1 and comm->zone_d2,
 * cell_ns_x0/cell_ns_x1 are widened to the min0/max1 of those zone
 * records, and comm->cell_f_max0/cell_f_min1 are set from the accumulated
 * extremes. The statement order matters here: buf_s is reused to pass
 * received data on with the next pulse.
 */
1039 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1040 rvec cell_ns_x0,rvec cell_ns_x1)
1042 int d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1043 gmx_ddzone_t *zp;
1044 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1045 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1046 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1047 rvec extr_s[2],extr_r[2];
1048 rvec dh;
1049 real dist_d,c=0,det;
1050 gmx_domdec_comm_t *comm;
1051 gmx_bool bPBC,bUse;
1053 comm = dd->comm;
1055 for(d=1; d<dd->ndim; d++)
1057 dim = dd->dim[d];
1058 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1059 zp->min0 = cell_ns_x0[dim];
1060 zp->max1 = cell_ns_x1[dim];
1061 zp->min1 = cell_ns_x1[dim];
1062 zp->mch0 = cell_ns_x0[dim];
1063 zp->mch1 = cell_ns_x1[dim];
1064 zp->p1_0 = cell_ns_x0[dim];
1065 zp->p1_1 = cell_ns_x1[dim];
1068 for(d=dd->ndim-2; d>=0; d--)
1070 dim = dd->dim[d];
1071 bPBC = (dim < ddbox->npbcdim);
1073 /* Use an rvec to store two reals */
1074 extr_s[d][0] = comm->cell_f0[d+1];
1075 extr_s[d][1] = comm->cell_f1[d+1];
1076 extr_s[d][2] = comm->cell_f1[d+1];
1078 pos = 0;
1079 /* Store the extremes in the backward sending buffer,
1080 * so they get updated separately from the forward communication.
1082 for(d1=d; d1<dd->ndim-1; d1++)
1084 /* We invert the order to be able to use the same loop for buf_e */
1085 buf_s[pos].min0 = extr_s[d1][1];
1086 buf_s[pos].max1 = extr_s[d1][0];
1087 buf_s[pos].min1 = extr_s[d1][2];
1088 buf_s[pos].mch0 = 0;
1089 buf_s[pos].mch1 = 0;
1090 /* Store the cell corner of the dimension we communicate along */
1091 buf_s[pos].p1_0 = comm->cell_x0[dim];
1092 buf_s[pos].p1_1 = 0;
1093 pos++;
1096 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1097 pos++;
1099 if (dd->ndim == 3 && d == 0)
1101 buf_s[pos] = comm->zone_d2[0][1];
1102 pos++;
1103 buf_s[pos] = comm->zone_d1[0];
1104 pos++;
1107 /* We only need to communicate the extremes
1108 * in the forward direction
1110 npulse = comm->cd[d].np;
1111 if (bPBC)
1113 /* Take the minimum to avoid double communication */
1114 npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1116 else
1118 /* Without PBC we should really not communicate over
1119 * the boundaries, but implementing that complicates
1120 * the communication setup and therefore we simply
1121 * do all communication, but ignore some data.
1123 npulse_min = npulse;
1125 for(p=0; p<npulse_min; p++)
1127 /* Communicate the extremes forward */
1128 bUse = (bPBC || dd->ci[dim] > 0);
1130 dd_sendrecv_rvec(dd, d, dddirForward,
1131 extr_s+d, dd->ndim-d-1,
1132 extr_r+d, dd->ndim-d-1);
1134 if (bUse)
1136 for(d1=d; d1<dd->ndim-1; d1++)
1138 extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1139 extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1140 extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1145 buf_size = pos;
1146 for(p=0; p<npulse; p++)
1148 /* Communicate all the zone information backward */
1149 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1151 dd_sendrecv_ddzone(dd, d, dddirBackward,
1152 buf_s, buf_size,
1153 buf_r, buf_size);
1155 clear_rvec(dh);
1156 if (p > 0)
1158 for(d1=d+1; d1<dd->ndim; d1++)
1160 /* Determine the decrease of maximum required
1161 * communication height along d1 due to the distance along d,
1162 * this avoids a lot of useless atom communication.
1164 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1166 if (ddbox->tric_dir[dim])
1168 /* c is the off-diagonal coupling between the cell planes
1169 * along directions d and d1.
1171 c = ddbox->v[dim][dd->dim[d1]][dim];
1173 else
1175 c = 0;
1177 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1178 if (det > 0)
1180 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1182 else
1184 /* A negative value signals out of range */
1185 dh[d1] = -1;
1190 /* Accumulate the extremes over all pulses */
1191 for(i=0; i<buf_size; i++)
1193 if (p == 0)
1195 buf_e[i] = buf_r[i];
1197 else
1199 if (bUse)
1201 buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1202 buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1203 buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1206 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1208 d1 = 1;
1210 else
1212 d1 = d + 1;
1214 if (bUse && dh[d1] >= 0)
1216 buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1217 buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1220 /* Copy the received buffer to the send buffer,
1221 * to pass the data through with the next pulse.
1223 buf_s[i] = buf_r[i];
1225 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1226 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1228 /* Store the extremes */
1229 pos = 0;
1231 for(d1=d; d1<dd->ndim-1; d1++)
1233 extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1234 extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1235 extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1236 pos++;
1239 if (d == 1 || (d == 0 && dd->ndim == 3))
1241 for(i=d; i<2; i++)
1243 comm->zone_d2[1-d][i] = buf_e[pos];
1244 pos++;
1247 if (d == 0)
1249 comm->zone_d1[1] = buf_e[pos];
1250 pos++;
1256 if (dd->ndim >= 2)
1258 dim = dd->dim[1];
1259 for(i=0; i<2; i++)
1261 if (debug)
1263 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1265 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1266 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1269 if (dd->ndim >= 3)
1271 dim = dd->dim[2];
1272 for(i=0; i<2; i++)
1274 for(j=0; j<2; j++)
1276 if (debug)
1278 print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1280 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1281 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1285 for(d=1; d<dd->ndim; d++)
1287 comm->cell_f_max0[d] = extr_s[d-1][0];
1288 comm->cell_f_min1[d] = extr_s[d-1][1];
1289 if (debug)
1291 fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1292 d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1297 static void dd_collect_cg(gmx_domdec_t *dd,
1298 t_state *state_local)
1300 gmx_domdec_master_t *ma=NULL;
1301 int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1302 t_block *cgs_gl;
1304 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1306 /* The master has the correct distribution */
1307 return;
1310 if (state_local->ddp_count == dd->ddp_count)
1312 ncg_home = dd->ncg_home;
1313 cg = dd->index_gl;
1314 nat_home = dd->nat_home;
1316 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1318 cgs_gl = &dd->comm->cgs_gl;
1320 ncg_home = state_local->ncg_gl;
1321 cg = state_local->cg_gl;
1322 nat_home = 0;
1323 for(i=0; i<ncg_home; i++)
1325 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1328 else
1330 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1333 buf2[0] = dd->ncg_home;
1334 buf2[1] = dd->nat_home;
1335 if (DDMASTER(dd))
1337 ma = dd->ma;
1338 ibuf = ma->ibuf;
1340 else
1342 ibuf = NULL;
1344 /* Collect the charge group and atom counts on the master */
1345 dd_gather(dd,2*sizeof(int),buf2,ibuf);
1347 if (DDMASTER(dd))
1349 ma->index[0] = 0;
1350 for(i=0; i<dd->nnodes; i++)
1352 ma->ncg[i] = ma->ibuf[2*i];
1353 ma->nat[i] = ma->ibuf[2*i+1];
1354 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1357 /* Make byte counts and indices */
1358 for(i=0; i<dd->nnodes; i++)
1360 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1361 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1363 if (debug)
1365 fprintf(debug,"Initial charge group distribution: ");
1366 for(i=0; i<dd->nnodes; i++)
1367 fprintf(debug," %d",ma->ncg[i]);
1368 fprintf(debug,"\n");
1372 /* Collect the charge group indices on the master */
1373 dd_gatherv(dd,
1374 dd->ncg_home*sizeof(int),dd->index_gl,
1375 DDMASTER(dd) ? ma->ibuf : NULL,
1376 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1377 DDMASTER(dd) ? ma->cg : NULL);
1379 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1382 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1383 rvec *lv,rvec *v)
1385 gmx_domdec_master_t *ma;
1386 int n,i,c,a,nalloc=0;
1387 rvec *buf=NULL;
1388 t_block *cgs_gl;
1390 ma = dd->ma;
1392 if (!DDMASTER(dd))
1394 #ifdef GMX_MPI
1395 MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1396 dd->rank,dd->mpi_comm_all);
1397 #endif
1398 } else {
1399 /* Copy the master coordinates to the global array */
1400 cgs_gl = &dd->comm->cgs_gl;
1402 n = DDMASTERRANK(dd);
1403 a = 0;
1404 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1406 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1408 copy_rvec(lv[a++],v[c]);
1412 for(n=0; n<dd->nnodes; n++)
1414 if (n != dd->rank)
1416 if (ma->nat[n] > nalloc)
1418 nalloc = over_alloc_dd(ma->nat[n]);
1419 srenew(buf,nalloc);
1421 #ifdef GMX_MPI
1422 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1423 n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1424 #endif
1425 a = 0;
1426 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1428 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1430 copy_rvec(buf[a++],v[c]);
1435 sfree(buf);
1439 static void get_commbuffer_counts(gmx_domdec_t *dd,
1440 int **counts,int **disps)
1442 gmx_domdec_master_t *ma;
1443 int n;
1445 ma = dd->ma;
1447 /* Make the rvec count and displacment arrays */
1448 *counts = ma->ibuf;
1449 *disps = ma->ibuf + dd->nnodes;
1450 for(n=0; n<dd->nnodes; n++)
1452 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1453 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1457 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1458 rvec *lv,rvec *v)
1460 gmx_domdec_master_t *ma;
1461 int *rcounts=NULL,*disps=NULL;
1462 int n,i,c,a;
1463 rvec *buf=NULL;
1464 t_block *cgs_gl;
1466 ma = dd->ma;
1468 if (DDMASTER(dd))
1470 get_commbuffer_counts(dd,&rcounts,&disps);
1472 buf = ma->vbuf;
1475 dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1477 if (DDMASTER(dd))
1479 cgs_gl = &dd->comm->cgs_gl;
1481 a = 0;
1482 for(n=0; n<dd->nnodes; n++)
1484 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1486 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1488 copy_rvec(buf[a++],v[c]);
1495 void dd_collect_vec(gmx_domdec_t *dd,
1496 t_state *state_local,rvec *lv,rvec *v)
1498 gmx_domdec_master_t *ma;
1499 int n,i,c,a,nalloc=0;
1500 rvec *buf=NULL;
1502 dd_collect_cg(dd,state_local);
1504 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1506 dd_collect_vec_sendrecv(dd,lv,v);
1508 else
1510 dd_collect_vec_gatherv(dd,lv,v);
1515 void dd_collect_state(gmx_domdec_t *dd,
1516 t_state *state_local,t_state *state)
1518 int est,i,j,nh;
1520 nh = state->nhchainlength;
1522 if (DDMASTER(dd))
1524 for (i=0;i<efptNR;i++) {
1525 state->lambda[i] = state_local->lambda[i];
1527 state->fep_state = state_local->fep_state;
1528 state->veta = state_local->veta;
1529 state->vol0 = state_local->vol0;
1530 copy_mat(state_local->box,state->box);
1531 copy_mat(state_local->boxv,state->boxv);
1532 copy_mat(state_local->svir_prev,state->svir_prev);
1533 copy_mat(state_local->fvir_prev,state->fvir_prev);
1534 copy_mat(state_local->pres_prev,state->pres_prev);
1537 for(i=0; i<state_local->ngtc; i++)
1539 for(j=0; j<nh; j++) {
1540 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1541 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1543 state->therm_integral[i] = state_local->therm_integral[i];
1545 for(i=0; i<state_local->nnhpres; i++)
1547 for(j=0; j<nh; j++) {
1548 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1549 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1553 for(est=0; est<estNR; est++)
1555 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1557 switch (est) {
1558 case estX:
1559 dd_collect_vec(dd,state_local,state_local->x,state->x);
1560 break;
1561 case estV:
1562 dd_collect_vec(dd,state_local,state_local->v,state->v);
1563 break;
1564 case estSDX:
1565 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1566 break;
1567 case estCGP:
1568 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1569 break;
1570 case estLD_RNG:
1571 if (state->nrngi == 1)
1573 if (DDMASTER(dd))
1575 for(i=0; i<state_local->nrng; i++)
1577 state->ld_rng[i] = state_local->ld_rng[i];
1581 else
1583 dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1584 state_local->ld_rng,state->ld_rng);
1586 break;
1587 case estLD_RNGI:
1588 if (state->nrngi == 1)
1590 if (DDMASTER(dd))
1592 state->ld_rngi[0] = state_local->ld_rngi[0];
1595 else
1597 dd_gather(dd,sizeof(state->ld_rngi[0]),
1598 state_local->ld_rngi,state->ld_rngi);
1600 break;
1601 case estDISRE_INITF:
1602 case estDISRE_RM3TAV:
1603 case estORIRE_INITF:
1604 case estORIRE_DTAV:
1605 break;
1606 default:
1607 gmx_incons("Unknown state entry encountered in dd_collect_state");
1613 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1615 int est;
1617 if (debug)
1619 fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1622 state->nalloc = over_alloc_dd(nalloc);
1624 for(est=0; est<estNR; est++)
1626 if (EST_DISTR(est) && (state->flags & (1<<est)))
1628 switch(est) {
1629 case estX:
1630 srenew(state->x,state->nalloc);
1631 break;
1632 case estV:
1633 srenew(state->v,state->nalloc);
1634 break;
1635 case estSDX:
1636 srenew(state->sd_X,state->nalloc);
1637 break;
1638 case estCGP:
1639 srenew(state->cg_p,state->nalloc);
1640 break;
1641 case estLD_RNG:
1642 case estLD_RNGI:
1643 case estDISRE_INITF:
1644 case estDISRE_RM3TAV:
1645 case estORIRE_INITF:
1646 case estORIRE_DTAV:
1647 /* No reallocation required */
1648 break;
1649 default:
1650 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1655 if (f != NULL)
1657 srenew(*f,state->nalloc);
1661 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1662 int nalloc)
1664 if (nalloc > fr->cg_nalloc)
1666 if (debug)
1668 fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1670 fr->cg_nalloc = over_alloc_dd(nalloc);
1671 srenew(fr->cginfo,fr->cg_nalloc);
1672 if (fr->cutoff_scheme == ecutsGROUP)
1674 srenew(fr->cg_cm,fr->cg_nalloc);
1677 if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1679 /* We don't use charge groups, we use x in state to set up
1680 * the atom communication.
1682 dd_realloc_state(state,f,nalloc);
1686 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1687 rvec *v,rvec *lv)
1689 gmx_domdec_master_t *ma;
1690 int n,i,c,a,nalloc=0;
1691 rvec *buf=NULL;
1693 if (DDMASTER(dd))
1695 ma = dd->ma;
1697 for(n=0; n<dd->nnodes; n++)
1699 if (n != dd->rank)
1701 if (ma->nat[n] > nalloc)
1703 nalloc = over_alloc_dd(ma->nat[n]);
1704 srenew(buf,nalloc);
1706 /* Use lv as a temporary buffer */
1707 a = 0;
1708 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1710 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1712 copy_rvec(v[c],buf[a++]);
1715 if (a != ma->nat[n])
1717 gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1718 a,ma->nat[n]);
1721 #ifdef GMX_MPI
1722 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1723 DDRANK(dd,n),n,dd->mpi_comm_all);
1724 #endif
1727 sfree(buf);
1728 n = DDMASTERRANK(dd);
1729 a = 0;
1730 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1732 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1734 copy_rvec(v[c],lv[a++]);
1738 else
1740 #ifdef GMX_MPI
1741 MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1742 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1743 #endif
1747 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1748 rvec *v,rvec *lv)
1750 gmx_domdec_master_t *ma;
1751 int *scounts=NULL,*disps=NULL;
1752 int n,i,c,a,nalloc=0;
1753 rvec *buf=NULL;
1755 if (DDMASTER(dd))
1757 ma = dd->ma;
1759 get_commbuffer_counts(dd,&scounts,&disps);
1761 buf = ma->vbuf;
1762 a = 0;
1763 for(n=0; n<dd->nnodes; n++)
1765 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1767 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1769 copy_rvec(v[c],buf[a++]);
1775 dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1778 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1780 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1782 dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1784 else
1786 dd_distribute_vec_scatterv(dd,cgs,v,lv);
1790 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1791 t_state *state,t_state *state_local,
1792 rvec **f)
1794 int i,j,nh;
1796 nh = state->nhchainlength;
1798 if (DDMASTER(dd))
1800 for(i=0;i<efptNR;i++)
1802 state_local->lambda[i] = state->lambda[i];
1804 state_local->fep_state = state->fep_state;
1805 state_local->veta = state->veta;
1806 state_local->vol0 = state->vol0;
1807 copy_mat(state->box,state_local->box);
1808 copy_mat(state->box_rel,state_local->box_rel);
1809 copy_mat(state->boxv,state_local->boxv);
1810 copy_mat(state->svir_prev,state_local->svir_prev);
1811 copy_mat(state->fvir_prev,state_local->fvir_prev);
1812 for(i=0; i<state_local->ngtc; i++)
1814 for(j=0; j<nh; j++) {
1815 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1816 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1818 state_local->therm_integral[i] = state->therm_integral[i];
1820 for(i=0; i<state_local->nnhpres; i++)
1822 for(j=0; j<nh; j++) {
1823 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1824 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1828 dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1829 dd_bcast(dd,sizeof(int),&state_local->fep_state);
1830 dd_bcast(dd,sizeof(real),&state_local->veta);
1831 dd_bcast(dd,sizeof(real),&state_local->vol0);
1832 dd_bcast(dd,sizeof(state_local->box),state_local->box);
1833 dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1834 dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1835 dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1836 dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1837 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1838 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1839 dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1840 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1841 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1843 if (dd->nat_home > state_local->nalloc)
1845 dd_realloc_state(state_local,f,dd->nat_home);
1847 for(i=0; i<estNR; i++)
1849 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1851 switch (i) {
1852 case estX:
1853 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1854 break;
1855 case estV:
1856 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1857 break;
1858 case estSDX:
1859 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1860 break;
1861 case estCGP:
1862 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1863 break;
1864 case estLD_RNG:
1865 if (state->nrngi == 1)
1867 dd_bcastc(dd,
1868 state_local->nrng*sizeof(state_local->ld_rng[0]),
1869 state->ld_rng,state_local->ld_rng);
1871 else
1873 dd_scatter(dd,
1874 state_local->nrng*sizeof(state_local->ld_rng[0]),
1875 state->ld_rng,state_local->ld_rng);
1877 break;
1878 case estLD_RNGI:
1879 if (state->nrngi == 1)
1881 dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1882 state->ld_rngi,state_local->ld_rngi);
1884 else
1886 dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1887 state->ld_rngi,state_local->ld_rngi);
1889 break;
1890 case estDISRE_INITF:
1891 case estDISRE_RM3TAV:
1892 case estORIRE_INITF:
1893 case estORIRE_DTAV:
1894 /* Not implemented yet */
1895 break;
1896 default:
1897 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1903 static char dim2char(int dim)
1905 char c='?';
1907 switch (dim)
1909 case XX: c = 'X'; break;
1910 case YY: c = 'Y'; break;
1911 case ZZ: c = 'Z'; break;
1912 default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1915 return c;
1918 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1919 gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1921 rvec grid_s[2],*grid_r=NULL,cx,r;
1922 char fname[STRLEN],format[STRLEN],buf[22];
1923 FILE *out;
1924 int a,i,d,z,y,x;
1925 matrix tric;
1926 real vol;
1928 copy_rvec(dd->comm->cell_x0,grid_s[0]);
1929 copy_rvec(dd->comm->cell_x1,grid_s[1]);
1931 if (DDMASTER(dd))
1933 snew(grid_r,2*dd->nnodes);
1936 dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1938 if (DDMASTER(dd))
1940 for(d=0; d<DIM; d++)
1942 for(i=0; i<DIM; i++)
1944 if (d == i)
1946 tric[d][i] = 1;
1948 else
1950 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1952 tric[d][i] = box[i][d]/box[i][i];
1954 else
1956 tric[d][i] = 0;
1961 sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1962 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1963 out = gmx_fio_fopen(fname,"w");
1964 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1965 a = 1;
1966 for(i=0; i<dd->nnodes; i++)
1968 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1969 for(d=0; d<DIM; d++)
1971 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1973 for(z=0; z<2; z++)
1975 for(y=0; y<2; y++)
1977 for(x=0; x<2; x++)
1979 cx[XX] = grid_r[i*2+x][XX];
1980 cx[YY] = grid_r[i*2+y][YY];
1981 cx[ZZ] = grid_r[i*2+z][ZZ];
1982 mvmul(tric,cx,r);
1983 fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1984 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1988 for(d=0; d<DIM; d++)
1990 for(x=0; x<4; x++)
1992 switch(d)
1994 case 0: y = 1 + i*8 + 2*x; break;
1995 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1996 case 2: y = 1 + i*8 + x; break;
1998 fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
2002 gmx_fio_fclose(out);
2003 sfree(grid_r);
2007 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
2008 gmx_mtop_t *mtop,t_commrec *cr,
2009 int natoms,rvec x[],matrix box)
2011 char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
2012 FILE *out;
2013 int i,ii,resnr,c;
2014 char *atomname,*resname;
2015 real b;
2016 gmx_domdec_t *dd;
2018 dd = cr->dd;
2019 if (natoms == -1)
2021 natoms = dd->comm->nat[ddnatVSITE];
2024 sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2026 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
2027 sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
2029 out = gmx_fio_fopen(fname,"w");
2031 fprintf(out,"TITLE %s\n",title);
2032 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2033 for(i=0; i<natoms; i++)
2035 ii = dd->gatindex[i];
2036 gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2037 if (i < dd->comm->nat[ddnatZONE])
2039 c = 0;
2040 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2042 c++;
2044 b = c;
2046 else if (i < dd->comm->nat[ddnatVSITE])
2048 b = dd->comm->zones.n;
2050 else
2052 b = dd->comm->zones.n + 1;
2054 fprintf(out,strlen(atomname)<4 ? format : format4,
2055 "ATOM",(ii+1)%100000,
2056 atomname,resname,' ',resnr%10000,' ',
2057 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2059 fprintf(out,"TER\n");
2061 gmx_fio_fclose(out);
2064 real dd_cutoff_mbody(gmx_domdec_t *dd)
2066 gmx_domdec_comm_t *comm;
2067 int di;
2068 real r;
2070 comm = dd->comm;
2072 r = -1;
2073 if (comm->bInterCGBondeds)
2075 if (comm->cutoff_mbody > 0)
2077 r = comm->cutoff_mbody;
2079 else
2081 /* cutoff_mbody=0 means we do not have DLB */
2082 r = comm->cellsize_min[dd->dim[0]];
2083 for(di=1; di<dd->ndim; di++)
2085 r = min(r,comm->cellsize_min[dd->dim[di]]);
2087 if (comm->bBondComm)
2089 r = max(r,comm->cutoff_mbody);
2091 else
2093 r = min(r,comm->cutoff);
2098 return r;
2101 real dd_cutoff_twobody(gmx_domdec_t *dd)
2103 real r_mb;
2105 r_mb = dd_cutoff_mbody(dd);
2107 return max(dd->comm->cutoff,r_mb);
2111 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2113 int nc,ntot;
2115 nc = dd->nc[dd->comm->cartpmedim];
2116 ntot = dd->comm->ntot[dd->comm->cartpmedim];
2117 copy_ivec(coord,coord_pme);
2118 coord_pme[dd->comm->cartpmedim] =
2119 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
/* Map DD index ddindex (out of ndd) onto a PME index (out of npme). */
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    int scaled;

    /* Here we assign a PME node to communicate with this DD node
     * by assuming that the major index of both is x.
     * We add npme/2 to obtain an even distribution.
     */
    scaled = ddindex*npme + npme/2;

    return scaled/ndd;
}
2131 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2133 return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2136 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2138 return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2141 static int *dd_pmenodes(t_commrec *cr)
2143 int *pmenodes;
2144 int n,i,p0,p1;
2146 snew(pmenodes,cr->npmenodes);
2147 n = 0;
2148 for(i=0; i<cr->dd->nnodes; i++) {
2149 p0 = cr_ddindex2pmeindex(cr,i);
2150 p1 = cr_ddindex2pmeindex(cr,i+1);
2151 if (i+1 == cr->dd->nnodes || p1 > p0) {
2152 if (debug)
2153 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2154 pmenodes[n] = i + 1 + n;
2155 n++;
2159 return pmenodes;
2162 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2164 gmx_domdec_t *dd;
2165 ivec coords,coords_pme,nc;
2166 int slab;
2168 dd = cr->dd;
2170 if (dd->comm->bCartesian) {
2171 gmx_ddindex2xyz(dd->nc,ddindex,coords);
2172 dd_coords2pmecoords(dd,coords,coords_pme);
2173 copy_ivec(dd->ntot,nc);
2174 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2175 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2177 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2178 } else {
2179 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2182 coords[XX] = x;
2183 coords[YY] = y;
2184 coords[ZZ] = z;
2185 slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2187 return slab;
2190 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2192 gmx_domdec_comm_t *comm;
2193 ivec coords;
2194 int ddindex,nodeid=-1;
2196 comm = cr->dd->comm;
2198 coords[XX] = x;
2199 coords[YY] = y;
2200 coords[ZZ] = z;
2201 if (comm->bCartesianPP_PME)
2203 #ifdef GMX_MPI
2204 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2205 #endif
2207 else
2209 ddindex = dd_index(cr->dd->nc,coords);
2210 if (comm->bCartesianPP)
2212 nodeid = comm->ddindex2simnodeid[ddindex];
2214 else
2216 if (comm->pmenodes)
2218 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2220 else
2222 nodeid = ddindex;
2227 return nodeid;
2230 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2232 gmx_domdec_t *dd;
2233 gmx_domdec_comm_t *comm;
2234 ivec coord,coord_pme;
2235 int i;
2236 int pmenode=-1;
2238 dd = cr->dd;
2239 comm = dd->comm;
2241 /* This assumes a uniform x domain decomposition grid cell size */
2242 if (comm->bCartesianPP_PME)
2244 #ifdef GMX_MPI
2245 MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2246 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2248 /* This is a PP node */
2249 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2250 MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2252 #endif
2254 else if (comm->bCartesianPP)
2256 if (sim_nodeid < dd->nnodes)
2258 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2261 else
2263 /* This assumes DD cells with identical x coordinates
2264 * are numbered sequentially.
2266 if (dd->comm->pmenodes == NULL)
2268 if (sim_nodeid < dd->nnodes)
2270 /* The DD index equals the nodeid */
2271 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2274 else
2276 i = 0;
2277 while (sim_nodeid > dd->comm->pmenodes[i])
2279 i++;
2281 if (sim_nodeid < dd->comm->pmenodes[i])
2283 pmenode = dd->comm->pmenodes[i];
2288 return pmenode;
2291 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2293 gmx_bool bPMEOnlyNode;
2295 if (DOMAINDECOMP(cr))
2297 bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2299 else
2301 bPMEOnlyNode = FALSE;
2304 return bPMEOnlyNode;
2307 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2308 int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2310 gmx_domdec_t *dd;
2311 int x,y,z;
2312 ivec coord,coord_pme;
2314 dd = cr->dd;
2316 snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2318 *nmy_ddnodes = 0;
2319 for(x=0; x<dd->nc[XX]; x++)
2321 for(y=0; y<dd->nc[YY]; y++)
2323 for(z=0; z<dd->nc[ZZ]; z++)
2325 if (dd->comm->bCartesianPP_PME)
2327 coord[XX] = x;
2328 coord[YY] = y;
2329 coord[ZZ] = z;
2330 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2331 if (dd->ci[XX] == coord_pme[XX] &&
2332 dd->ci[YY] == coord_pme[YY] &&
2333 dd->ci[ZZ] == coord_pme[ZZ])
2334 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2336 else
2338 /* The slab corresponds to the nodeid in the PME group */
2339 if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2341 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2348 /* The last PP-only node is the peer node */
2349 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2351 if (debug)
2353 fprintf(debug,"Receive coordinates from PP nodes:");
2354 for(x=0; x<*nmy_ddnodes; x++)
2356 fprintf(debug," %d",(*my_ddnodes)[x]);
2358 fprintf(debug,"\n");
2362 static gmx_bool receive_vir_ener(t_commrec *cr)
2364 gmx_domdec_comm_t *comm;
2365 int pmenode,coords[DIM],rank;
2366 gmx_bool bReceive;
2368 bReceive = TRUE;
2369 if (cr->npmenodes < cr->dd->nnodes)
2371 comm = cr->dd->comm;
2372 if (comm->bCartesianPP_PME)
2374 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2375 #ifdef GMX_MPI
2376 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2377 coords[comm->cartpmedim]++;
2378 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2380 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2381 if (dd_simnode2pmenode(cr,rank) == pmenode)
2383 /* This is not the last PP node for pmenode */
2384 bReceive = FALSE;
2387 #endif
2389 else
2391 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2392 if (cr->sim_nodeid+1 < cr->nnodes &&
2393 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2395 /* This is not the last PP node for pmenode */
2396 bReceive = FALSE;
2401 return bReceive;
2404 static void set_zones_ncg_home(gmx_domdec_t *dd)
2406 gmx_domdec_zones_t *zones;
2407 int i;
2409 zones = &dd->comm->zones;
2411 zones->cg_range[0] = 0;
2412 for(i=1; i<zones->n+1; i++)
2414 zones->cg_range[i] = dd->ncg_home;
2418 static void rebuild_cgindex(gmx_domdec_t *dd,
2419 const int *gcgs_index,t_state *state)
2421 int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2423 ind = state->cg_gl;
2424 dd_cg_gl = dd->index_gl;
2425 cgindex = dd->cgindex;
2426 nat = 0;
2427 cgindex[0] = nat;
2428 for(i=0; i<state->ncg_gl; i++)
2430 cgindex[i] = nat;
2431 cg_gl = ind[i];
2432 dd_cg_gl[i] = cg_gl;
2433 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2435 cgindex[i] = nat;
2437 dd->ncg_home = state->ncg_gl;
2438 dd->nat_home = nat;
2440 set_zones_ncg_home(dd);
2443 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2445 while (cg >= cginfo_mb->cg_end)
2447 cginfo_mb++;
2450 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2453 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2454 t_forcerec *fr,char *bLocalCG)
2456 cginfo_mb_t *cginfo_mb;
2457 int *cginfo;
2458 int cg;
2460 if (fr != NULL)
2462 cginfo_mb = fr->cginfo_mb;
2463 cginfo = fr->cginfo;
2465 for(cg=cg0; cg<cg1; cg++)
2467 cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2471 if (bLocalCG != NULL)
2473 for(cg=cg0; cg<cg1; cg++)
2475 bLocalCG[index_gl[cg]] = TRUE;
2480 static void make_dd_indices(gmx_domdec_t *dd,
2481 const int *gcgs_index,int cg_start)
2483 int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2484 int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2485 gmx_ga2la_t *ga2la;
2486 char *bLocalCG;
2487 gmx_bool bCGs;
2489 bLocalCG = dd->comm->bLocalCG;
2491 if (dd->nat_tot > dd->gatindex_nalloc)
2493 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2494 srenew(dd->gatindex,dd->gatindex_nalloc);
2497 nzone = dd->comm->zones.n;
2498 zone2cg = dd->comm->zones.cg_range;
2499 zone_ncg1 = dd->comm->zone_ncg1;
2500 index_gl = dd->index_gl;
2501 gatindex = dd->gatindex;
2502 bCGs = dd->comm->bCGs;
2504 if (zone2cg[1] != dd->ncg_home)
2506 gmx_incons("dd->ncg_zone is not up to date");
2509 /* Make the local to global and global to local atom index */
2510 a = dd->cgindex[cg_start];
2511 for(zone=0; zone<nzone; zone++)
2513 if (zone == 0)
2515 cg0 = cg_start;
2517 else
2519 cg0 = zone2cg[zone];
2521 cg1 = zone2cg[zone+1];
2522 cg1_p1 = cg0 + zone_ncg1[zone];
2524 for(cg=cg0; cg<cg1; cg++)
2526 zone1 = zone;
2527 if (cg >= cg1_p1)
2529 /* Signal that this cg is from more than one pulse away */
2530 zone1 += nzone;
2532 cg_gl = index_gl[cg];
2533 if (bCGs)
2535 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2537 gatindex[a] = a_gl;
2538 ga2la_set(dd->ga2la,a_gl,a,zone1);
2539 a++;
2542 else
2544 gatindex[a] = cg_gl;
2545 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2546 a++;
2552 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2553 const char *where)
2555 int ncg,i,ngl,nerr;
2557 nerr = 0;
2558 if (bLocalCG == NULL)
2560 return nerr;
2562 for(i=0; i<dd->ncg_tot; i++)
2564 if (!bLocalCG[dd->index_gl[i]])
2566 fprintf(stderr,
2567 "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2568 nerr++;
2571 ngl = 0;
2572 for(i=0; i<ncg_sys; i++)
2574 if (bLocalCG[i])
2576 ngl++;
2579 if (ngl != dd->ncg_tot)
2581 fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2582 nerr++;
2585 return nerr;
2588 static void check_index_consistency(gmx_domdec_t *dd,
2589 int natoms_sys,int ncg_sys,
2590 const char *where)
2592 int nerr,ngl,i,a,cell;
2593 int *have;
2595 nerr = 0;
2597 if (dd->comm->DD_debug > 1)
2599 snew(have,natoms_sys);
2600 for(a=0; a<dd->nat_tot; a++)
2602 if (have[dd->gatindex[a]] > 0)
2604 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2606 else
2608 have[dd->gatindex[a]] = a + 1;
2611 sfree(have);
2614 snew(have,dd->nat_tot);
2616 ngl = 0;
2617 for(i=0; i<natoms_sys; i++)
2619 if (ga2la_get(dd->ga2la,i,&a,&cell))
2621 if (a >= dd->nat_tot)
2623 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2624 nerr++;
2626 else
2628 have[a] = 1;
2629 if (dd->gatindex[a] != i)
2631 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2632 nerr++;
2635 ngl++;
2638 if (ngl != dd->nat_tot)
2640 fprintf(stderr,
2641 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2642 dd->rank,where,ngl,dd->nat_tot);
2644 for(a=0; a<dd->nat_tot; a++)
2646 if (have[a] == 0)
2648 fprintf(stderr,
2649 "DD node %d, %s: local atom %d, global %d has no global index\n",
2650 dd->rank,where,a+1,dd->gatindex[a]+1);
2653 sfree(have);
2655 nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2657 if (nerr > 0) {
2658 gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2659 dd->rank,where,nerr);
2663 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2665 int i;
2666 char *bLocalCG;
2668 if (a_start == 0)
2670 /* Clear the whole list without searching */
2671 ga2la_clear(dd->ga2la);
2673 else
2675 for(i=a_start; i<dd->nat_tot; i++)
2677 ga2la_del(dd->ga2la,dd->gatindex[i]);
2681 bLocalCG = dd->comm->bLocalCG;
2682 if (bLocalCG)
2684 for(i=cg_start; i<dd->ncg_tot; i++)
2686 bLocalCG[dd->index_gl[i]] = FALSE;
2690 dd_clear_local_vsite_indices(dd);
2692 if (dd->constraints)
2694 dd_clear_local_constraint_indices(dd);
2698 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2699 int dim_ind)
2701 real grid_jump_limit;
2703 /* The distance between the boundaries of cells at distance
2704 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2705 * and by the fact that cells should not be shifted by more than
2706 * half their size, such that cg's only shift by one cell
2707 * at redecomposition.
2709 grid_jump_limit = comm->cellsize_limit;
2710 if (!comm->bVacDLBNoLimit)
2712 grid_jump_limit = max(grid_jump_limit,
2713 cutoff/comm->cd[dim_ind].np);
2716 return grid_jump_limit;
2719 static gmx_bool check_grid_jump(gmx_large_int_t step,
2720 gmx_domdec_t *dd,
2721 real cutoff,
2722 gmx_ddbox_t *ddbox,
2723 gmx_bool bFatal)
2725 gmx_domdec_comm_t *comm;
2726 int d,dim;
2727 real limit,bfac;
2728 gmx_bool bInvalid;
2730 bInvalid = FALSE;
2732 comm = dd->comm;
2734 for(d=1; d<dd->ndim; d++)
2736 dim = dd->dim[d];
2737 limit = grid_jump_limit(comm,cutoff,d);
2738 bfac = ddbox->box_size[dim];
2739 if (ddbox->tric_dir[dim])
2741 bfac *= ddbox->skew_fac[dim];
2743 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2744 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2746 bInvalid = TRUE;
2748 if (bFatal)
2750 char buf[22];
2752 /* This error should never be triggered under normal
2753 * circumstances, but you never know ...
2755 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2756 gmx_step_str(step,buf),
2757 dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2762 return bInvalid;
2765 static int dd_load_count(gmx_domdec_comm_t *comm)
2767 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2770 static float dd_force_load(gmx_domdec_comm_t *comm)
2772 float load;
2774 if (comm->eFlop)
2776 load = comm->flop;
2777 if (comm->eFlop > 1)
2779 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2782 else
2784 load = comm->cycl[ddCyclF];
2785 if (comm->cycl_n[ddCyclF] > 1)
2787 /* Subtract the maximum of the last n cycle counts
2788 * to get rid of possible high counts due to other soures,
2789 * for instance system activity, that would otherwise
2790 * affect the dynamic load balancing.
2792 load -= comm->cycl_max[ddCyclF];
2796 return load;
2799 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2801 gmx_domdec_comm_t *comm;
2802 int i;
2804 comm = dd->comm;
2806 snew(*dim_f,dd->nc[dim]+1);
2807 (*dim_f)[0] = 0;
2808 for(i=1; i<dd->nc[dim]; i++)
2810 if (comm->slb_frac[dim])
2812 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2814 else
2816 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2819 (*dim_f)[dd->nc[dim]] = 1;
2822 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2824 int pmeindex,slab,nso,i;
2825 ivec xyz;
2827 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2829 ddpme->dim = YY;
2831 else
2833 ddpme->dim = dimind;
2835 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2837 ddpme->nslab = (ddpme->dim == 0 ?
2838 dd->comm->npmenodes_x :
2839 dd->comm->npmenodes_y);
2841 if (ddpme->nslab <= 1)
2843 return;
2846 nso = dd->comm->npmenodes/ddpme->nslab;
2847 /* Determine for each PME slab the PP location range for dimension dim */
2848 snew(ddpme->pp_min,ddpme->nslab);
2849 snew(ddpme->pp_max,ddpme->nslab);
2850 for(slab=0; slab<ddpme->nslab; slab++) {
2851 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2852 ddpme->pp_max[slab] = 0;
2854 for(i=0; i<dd->nnodes; i++) {
2855 ddindex2xyz(dd->nc,i,xyz);
2856 /* For y only use our y/z slab.
2857 * This assumes that the PME x grid size matches the DD grid size.
2859 if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2860 pmeindex = ddindex2pmeindex(dd,i);
2861 if (dimind == 0) {
2862 slab = pmeindex/nso;
2863 } else {
2864 slab = pmeindex % ddpme->nslab;
2866 ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2867 ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2871 set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2874 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2876 if (dd->comm->ddpme[0].dim == XX)
2878 return dd->comm->ddpme[0].maxshift;
2880 else
2882 return 0;
2886 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2888 if (dd->comm->ddpme[0].dim == YY)
2890 return dd->comm->ddpme[0].maxshift;
2892 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2894 return dd->comm->ddpme[1].maxshift;
2896 else
2898 return 0;
/* Determines ddpme->maxshift, the PME slab communication range for
 * dimension ddpme->dim, and stores it in ddpme.
 *
 * - When ddpme->dim_match is false the shift is set to nslab/2.
 * - When there are at most 3 slabs, or the grid is uniform (bUniform)
 *   with as many slabs as DD cells along this dimension, the shift is 1.
 * - Otherwise the shift starts at 1 and is increased (staying below
 *   nslab) for as long as, for any slab s, a PP cell boundary taken from
 *   cell_f via ddpme->pp_max / ddpme->pp_min of the slab sh+1 positions
 *   below / above s (slab index wrapped periodically, boundary shifted
 *   by -1 / +1 on wrap) comes within "range" of the lower / upper
 *   boundary of slab s. "range" is 2/3 of comm->cutoff divided by
 *   ddbox->box_size[dim], scaled by 0.999.
 *
 * cell_f holds relative cell boundaries and is read at indices up to
 * pp_max[slab]+1.
 * The result is also printed to the debug file when debug is set.
 */
2902 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2903 gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2905 gmx_domdec_comm_t *comm;
2906 int nc,ns,s;
2907 int *xmin,*xmax;
2908 real range,pme_boundary;
2909 int sh;
2911 comm = dd->comm;
2912 nc = dd->nc[ddpme->dim];
2913 ns = ddpme->nslab;
2915 if (!ddpme->dim_match)
2917 /* PP decomposition is not along dim: the worst situation */
2918 sh = ns/2;
2920 else if (ns <= 3 || (bUniform && ns == nc))
2922 /* The optimal situation */
2923 sh = 1;
2925 else
2927 /* We need to check for all pme nodes which nodes they
2928 * could possibly need to communicate with.
2930 xmin = ddpme->pp_min;
2931 xmax = ddpme->pp_max;
2932 /* Allow for atoms to be maximally 2/3 times the cut-off
2933 * out of their DD cell. This is a reasonable balance between
2934 * between performance and support for most charge-group/cut-off
2935 * combinations.
2937 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2938 /* Avoid extra communication when we are exactly at a boundary */
2939 range *= 0.999;
2941 sh = 1;
2942 for(s=0; s<ns; s++)
2944 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2945 pme_boundary = (real)s/ns;
2946 while (sh+1 < ns &&
2947 ((s-(sh+1) >= 0 &&
2948 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2949 (s-(sh+1) < 0 &&
2950 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2952 sh++;
2954 pme_boundary = (real)(s+1)/ns;
2955 while (sh+1 < ns &&
2956 ((s+(sh+1) < ns &&
2957 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2958 (s+(sh+1) >= ns &&
2959 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2961 sh++;
2966 ddpme->maxshift = sh;
2968 if (debug)
2970 fprintf(debug,"PME slab communication range for dim %d is %d\n",
2971 ddpme->dim,ddpme->maxshift);
2975 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2977 int d,dim;
2979 for(d=0; d<dd->ndim; d++)
2981 dim = dd->dim[d];
2982 if (dim < ddbox->nboundeddim &&
2983 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2984 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2986 gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2987 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2988 dd->nc[dim],dd->comm->cellsize_limit);
/* Sets the DD cell boundaries for the statically load balanced case and
 * determines npulse[d] for each of the DIM dimensions.
 *
 * For every dimension d:
 * - Uniform grid (a single cell, or comm->slb_frac[d] == NULL): all cells
 *   get size box_size/nc. With bMaster all boundaries are written to
 *   dd->ma->cell_x[d]; otherwise only the bounds of our own cell
 *   (index dd->ci[d]) are written to comm->cell_x0/cell_x1.
 * - Otherwise the boundaries are accumulated from the slb_frac fractions.
 *   With bMaster they go directly into dd->ma->cell_x[d]; otherwise a
 *   temporary array is allocated, our own bounds are copied out of it and
 *   the array is freed again.
 * - npulse[d] starts at 1 and is increased while cellsize*npulse[d] is
 *   below comm->cutoff and npulse[d] < nc-1.
 * - A collective fatal error is raised when d < ddbox->npbcdim, nc > 1
 *   and npulse[d] >= nc.
 *   NOTE(review): the loops stop raising npulse[d] at nc-1, so for nc > 1
 *   this condition looks unreachable - confirm.
 *
 * When dynamic load balancing is off, the smallest cell sizes found are
 * copied to comm->cellsize_min. Finally set_pme_maxshift() is called for
 * every PME decomposition dimension using the ddpme slb_dim_f fractions.
 */
2993 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2994 gmx_bool bMaster,ivec npulse)
2996 gmx_domdec_comm_t *comm;
2997 int d,j;
2998 rvec cellsize_min;
2999 real *cell_x,cell_dx,cellsize;
3001 comm = dd->comm;
3003 for(d=0; d<DIM; d++)
3005 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3006 npulse[d] = 1;
3007 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3009 /* Uniform grid */
3010 cell_dx = ddbox->box_size[d]/dd->nc[d];
3011 if (bMaster)
3013 for(j=0; j<dd->nc[d]+1; j++)
3015 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3018 else
3020 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
3021 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3023 cellsize = cell_dx*ddbox->skew_fac[d];
3024 while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3026 npulse[d]++;
3028 cellsize_min[d] = cellsize;
3030 else
3032 /* Statically load balanced grid */
3033 /* Also when we are not doing a master distribution we determine
3034 * all cell borders in a loop to obtain identical values
3035 * to the master distribution case and to determine npulse.
3037 if (bMaster)
3039 cell_x = dd->ma->cell_x[d];
3041 else
3043 snew(cell_x,dd->nc[d]+1);
3045 cell_x[0] = ddbox->box0[d];
3046 for(j=0; j<dd->nc[d]; j++)
3048 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3049 cell_x[j+1] = cell_x[j] + cell_dx;
3050 cellsize = cell_dx*ddbox->skew_fac[d];
3051 while (cellsize*npulse[d] < comm->cutoff &&
3052 npulse[d] < dd->nc[d]-1)
3054 npulse[d]++;
3056 cellsize_min[d] = min(cellsize_min[d],cellsize);
3058 if (!bMaster)
3060 comm->cell_x0[d] = cell_x[dd->ci[d]];
3061 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3062 sfree(cell_x);
3065 /* The following limitation is to avoid that a cell would receive
3066 * some of its own home charge groups back over the periodic boundary.
3067 * Double charge groups cause trouble with the global indices.
3069 if (d < ddbox->npbcdim &&
3070 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3072 gmx_fatal_collective(FARGS,NULL,dd,
3073 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3074 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3075 comm->cutoff,
3076 dd->nc[d],dd->nc[d],
3077 dd->nnodes > dd->nc[d] ? "cells" : "processors");
3081 if (!comm->bDynLoadBal)
3083 copy_rvec(cellsize_min,comm->cellsize_min);
3086 for(d=0; d<comm->npmedecompdim; d++)
3088 set_pme_maxshift(dd,&comm->ddpme[d],
3089 comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3090 comm->ddpme[d].slb_dim_f);
/* Turns the relative cell sizes in root->buf_ncd for the cells in
 * [range[0], range[1]) into cell boundaries in root->cell_f, keeping the
 * boundaries at range[0] and range[1] fixed, while enforcing the minimum
 * relative cell size cellsize_limit_f. Calls itself recursively on
 * sub-ranges.
 *
 * Steps, in order:
 * 1. Iteratively rescale all cells that are not yet pinned so the range
 *    is filled exactly. A cell that drops below its limit is pinned to
 *    the limit (root->bCellMin) and the rescaling is repeated until no
 *    new cell gets pinned. Without PBC along dim the first and last cell
 *    of the grid use a limit of 0.
 * 2. With PBC, a fatal error is raised when the last cell of the range
 *    ends up smaller than cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN.
 * 3. root->bLimited is set when any cell was pinned or when the range is
 *    not the full [0, ncd) range.
 * 4. Unless bUniform, every inner boundary is kept from moving beyond the
 *    midpoint of the old neighbouring cells (root->old_cell_f); the
 *    boundaries after/before it are shifted when needed to keep a
 *    distance of cellsize_limit_f.
 * 5. For d > 0 the staggering is handled: with bUniform the boundaries
 *    are copied to root->cell_f_max0/cell_f_min1; otherwise boundaries
 *    violating root->bound_min/bound_max are reset to a bound (or the
 *    average of both bounds when both are violated) and the function
 *    recurses on the sub-ranges on either side, using nrange.
 */
3095 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3096 int d,int dim,gmx_domdec_root_t *root,
3097 gmx_ddbox_t *ddbox,
3098 gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3100 gmx_domdec_comm_t *comm;
3101 int ncd,i,j,nmin,nmin_old;
3102 gmx_bool bLimLo,bLimHi;
3103 real *cell_size;
3104 real fac,halfway,cellsize_limit_f_i,region_size;
3105 gmx_bool bPBC,bLastHi=FALSE;
3106 int nrange[]={range[0],range[1]};
3108 region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
3110 comm = dd->comm;
3112 ncd = dd->nc[dim];
3114 bPBC = (dim < ddbox->npbcdim);
3116 cell_size = root->buf_ncd;
3118 if (debug)
3120 fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3123 /* First we need to check if the scaling does not make cells
3124 * smaller than the smallest allowed size.
3125 * We need to do this iteratively, since if a cell is too small,
3126 * it needs to be enlarged, which makes all the other cells smaller,
3127 * which could in turn make another cell smaller than allowed.
3129 for(i=range[0]; i<range[1]; i++)
3131 root->bCellMin[i] = FALSE;
3133 nmin = 0;
3136 nmin_old = nmin;
3137 /* We need the total for normalization */
3138 fac = 0;
3139 for(i=range[0]; i<range[1]; i++)
3141 if (root->bCellMin[i] == FALSE)
3143 fac += cell_size[i];
3146 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3147 /* Determine the cell boundaries */
3148 for(i=range[0]; i<range[1]; i++)
3150 if (root->bCellMin[i] == FALSE)
3152 cell_size[i] *= fac;
3153 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3155 cellsize_limit_f_i = 0;
3157 else
3159 cellsize_limit_f_i = cellsize_limit_f;
3161 if (cell_size[i] < cellsize_limit_f_i)
3163 root->bCellMin[i] = TRUE;
3164 cell_size[i] = cellsize_limit_f_i;
3165 nmin++;
3168 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3171 while (nmin > nmin_old);
3173 i=range[1]-1;
3174 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3175 /* For this check we should not use DD_CELL_MARGIN,
3176 * but a slightly smaller factor,
3177 * since rounding could get use below the limit.
3179 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3181 char buf[22];
3182 gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3183 gmx_step_str(step,buf),
3184 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3185 ncd,comm->cellsize_min[dim]);
3188 root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3190 if (!bUniform)
3192 /* Check if the boundary did not displace more than halfway
3193 * each of the cells it bounds, as this could cause problems,
3194 * especially when the differences between cell sizes are large.
3195 * If changes are applied, they will not make cells smaller
3196 * than the cut-off, as we check all the boundaries which
3197 * might be affected by a change and if the old state was ok,
3198 * the cells will at most be shrunk back to their old size.
3200 for(i=range[0]+1; i<range[1]; i++)
3202 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3203 if (root->cell_f[i] < halfway)
3205 root->cell_f[i] = halfway;
3206 /* Check if the change also causes shifts of the next boundaries */
3207 for(j=i+1; j<range[1]; j++)
3209 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3210 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3213 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3214 if (root->cell_f[i] > halfway)
3216 root->cell_f[i] = halfway;
3217 /* Check if the change also causes shifts of the next boundaries */
3218 for(j=i-1; j>=range[0]+1; j--)
3220 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3221 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3227 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3228 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3229 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3230 * for a and b nrange is used */
3231 if (d > 0)
3233 /* Take care of the staggering of the cell boundaries */
3234 if (bUniform)
3236 for(i=range[0]; i<range[1]; i++)
3238 root->cell_f_max0[i] = root->cell_f[i];
3239 root->cell_f_min1[i] = root->cell_f[i+1];
3242 else
3244 for(i=range[0]+1; i<range[1]; i++)
3246 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3247 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3248 if (bLimLo && bLimHi)
3250 /* Both limits violated, try the best we can */
3251 /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3252 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3253 nrange[0]=range[0];
3254 nrange[1]=i;
3255 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3257 nrange[0]=i;
3258 nrange[1]=range[1];
3259 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3261 return;
3263 else if (bLimLo)
3265 /* root->cell_f[i] = root->bound_min[i]; */
3266 nrange[1]=i; /* only store violation location. There could be a LimLo violation following with an higher index */
3267 bLastHi=FALSE;
3269 else if (bLimHi && !bLastHi)
3271 bLastHi=TRUE;
3272 if (nrange[1] < range[1]) /* found a LimLo before */
3274 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3275 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3276 nrange[0]=nrange[1];
3278 root->cell_f[i] = root->bound_max[i];
3279 nrange[1]=i;
3280 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3281 nrange[0]=i;
3282 nrange[1]=range[1];
3285 if (nrange[1] < range[1]) /* found last a LimLo */
3287 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3288 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3289 nrange[0]=nrange[1];
3290 nrange[1]=range[1];
3291 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3293 else if (nrange[0] > range[0]) /* found at least one LimHi */
3295 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
/* Computes, on the root of a DD row, the new relative cell boundaries
 * root->cell_f for DD dimension index d (Cartesian dimension dim) under
 * dynamic load balancing.
 *
 * Steps, in order:
 * 1. The current boundaries are saved in root->old_cell_f.
 * 2. Target relative cell sizes are written to root->buf_ncd:
 *    - bUniform: every cell gets 1/ncd;
 *    - else, when dd_load_count(comm) is non-zero: each cell's old size
 *      is scaled by (1 + change), where change is minus the relative load
 *      imbalance of the cell (read from comm->load[d]) times a scaling
 *      factor. That factor starts at relax (0.5) and is reduced so that
 *      the largest change does not exceed comm->dlb_scale_lim percent.
 *    NOTE(review): when neither case applies buf_ncd is not written here -
 *    confirm what the callee then operates on.
 * 3. The relative minimum cell size and minimum staggering distance are
 *    derived from comm->cellsize_min and grid_jump_limit(), including
 *    DD_CELL_MARGIN, the triclinic skew factor and, for a dynamic box with
 *    d > 0, DD_PRES_SCALE_MARGIN.
 * 4. For d > 0 and a non-uniform grid, root->bound_min/bound_max are set
 *    from root->cell_f_max0/cell_f_min1; an inconsistency error is raised
 *    when neighbouring limits are closer than twice the hard minimum.
 * 5. cell_f[0] = 0, cell_f[ncd] = 1 and
 *    dd_cell_sizes_dlb_root_enforce_limits() is called on [0, ncd).
 * 6. A warning is printed to stderr for every cell that is still too
 *    small (outer cells are exempt without PBC).
 * 7. After index ncd, root->cell_f receives the cell fractions of the
 *    lower dimensions (comm->cell_f0/cell_f1) followed by the PME
 *    maxshift value(s), stored as reals.
 */
3302 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3303 int d,int dim,gmx_domdec_root_t *root,
3304 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3305 gmx_bool bUniform,gmx_large_int_t step)
3307 gmx_domdec_comm_t *comm;
3308 int ncd,d1,i,j,pos;
3309 real *cell_size;
3310 real load_aver,load_i,imbalance,change,change_max,sc;
3311 real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3312 real change_limit;
3313 real relax = 0.5;
3314 gmx_bool bPBC;
3315 int range[] = { 0, 0 };
3317 comm = dd->comm;
3319 /* Convert the maximum change from the input percentage to a fraction */
3320 change_limit = comm->dlb_scale_lim*0.01;
3322 ncd = dd->nc[dim];
3324 bPBC = (dim < ddbox->npbcdim);
3326 cell_size = root->buf_ncd;
3328 /* Store the original boundaries */
3329 for(i=0; i<ncd+1; i++)
3331 root->old_cell_f[i] = root->cell_f[i];
3333 if (bUniform) {
3334 for(i=0; i<ncd; i++)
3336 cell_size[i] = 1.0/ncd;
3339 else if (dd_load_count(comm))
3341 load_aver = comm->load[d].sum_m/ncd;
3342 change_max = 0;
3343 for(i=0; i<ncd; i++)
3345 /* Determine the relative imbalance of cell i */
3346 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3347 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3348 /* Determine the change of the cell size using underrelaxation */
3349 change = -relax*imbalance;
3350 change_max = max(change_max,max(change,-change));
3352 /* Limit the amount of scaling.
3353 * We need to use the same rescaling for all cells in one row,
3354 * otherwise the load balancing might not converge.
3356 sc = relax;
3357 if (change_max > change_limit)
3359 sc *= change_limit/change_max;
3361 for(i=0; i<ncd; i++)
3363 /* Determine the relative imbalance of cell i */
3364 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3365 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3366 /* Determine the change of the cell size using underrelaxation */
3367 change = -sc*imbalance;
3368 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3372 cellsize_limit_f = comm->cellsize_min[dim]/ddbox->box_size[dim];
3373 cellsize_limit_f *= DD_CELL_MARGIN;
3374 dist_min_f_hard = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3375 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3376 if (ddbox->tric_dir[dim])
3378 cellsize_limit_f /= ddbox->skew_fac[dim];
3379 dist_min_f /= ddbox->skew_fac[dim];
3381 if (bDynamicBox && d > 0)
3383 dist_min_f *= DD_PRES_SCALE_MARGIN;
3385 if (d > 0 && !bUniform)
3387 /* Make sure that the grid is not shifted too much */
3388 for(i=1; i<ncd; i++) {
3389 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3391 gmx_incons("Inconsistent DD boundary staggering limits!");
3393 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3394 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3395 if (space > 0) {
3396 root->bound_min[i] += 0.5*space;
3398 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3399 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3400 if (space < 0) {
3401 root->bound_max[i] += 0.5*space;
3403 if (debug)
3405 fprintf(debug,
3406 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3407 d,i,
3408 root->cell_f_max0[i-1] + dist_min_f,
3409 root->bound_min[i],root->cell_f[i],root->bound_max[i],
3410 root->cell_f_min1[i] - dist_min_f);
3414 range[1]=ncd;
3415 root->cell_f[0] = 0;
3416 root->cell_f[ncd] = 1;
3417 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3420 /* After the checks above, the cells should obey the cut-off
3421 * restrictions, but it does not hurt to check.
3423 for(i=0; i<ncd; i++)
3425 if (debug)
3427 fprintf(debug,"Relative bounds dim %d cell %d: %f %f\n",
3428 dim,i,root->cell_f[i],root->cell_f[i+1]);
3431 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3432 root->cell_f[i+1] - root->cell_f[i] <
3433 cellsize_limit_f/DD_CELL_MARGIN)
3435 char buf[22];
3436 fprintf(stderr,
3437 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3438 gmx_step_str(step,buf),dim2char(dim),i,
3439 (root->cell_f[i+1] - root->cell_f[i])
3440 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3444 pos = ncd + 1;
3445 /* Store the cell boundaries of the lower dimensions at the end */
3446 for(d1=0; d1<d; d1++)
3448 root->cell_f[pos++] = comm->cell_f0[d1];
3449 root->cell_f[pos++] = comm->cell_f1[d1];
3452 if (d < comm->npmedecompdim)
3454 /* The master determines the maximum shift for
3455 * the coordinate communication between separate PME nodes.
3457 set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3459 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3460 if (d >= 1)
3462 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3466 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3467 gmx_ddbox_t *ddbox,int dimind)
3469 gmx_domdec_comm_t *comm;
3470 int dim;
3472 comm = dd->comm;
3474 /* Set the cell dimensions */
3475 dim = dd->dim[dimind];
3476 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3477 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3478 if (dim >= ddbox->nboundeddim)
3480 comm->cell_x0[dim] += ddbox->box0[dim];
3481 comm->cell_x1[dim] += ddbox->box0[dim];
3485 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3486 int d,int dim,real *cell_f_row,
3487 gmx_ddbox_t *ddbox)
3489 gmx_domdec_comm_t *comm;
3490 int d1,dim1,pos;
3492 comm = dd->comm;
3494 #ifdef GMX_MPI
3495 /* Each node would only need to know two fractions,
3496 * but it is probably cheaper to broadcast the whole array.
3498 MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3499 0,comm->mpi_comm_load[d]);
3500 #endif
3501 /* Copy the fractions for this dimension from the buffer */
3502 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3503 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3504 /* The whole array was communicated, so set the buffer position */
3505 pos = dd->nc[dim] + 1;
3506 for(d1=0; d1<=d; d1++)
3508 if (d1 < d)
3510 /* Copy the cell fractions of the lower dimensions */
3511 comm->cell_f0[d1] = cell_f_row[pos++];
3512 comm->cell_f1[d1] = cell_f_row[pos++];
3514 relative_to_absolute_cell_bounds(dd,ddbox,d1);
3516 /* Convert the communicated shift from float to int */
3517 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3518 if (d >= 1)
3520 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3524 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3525 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3526 gmx_bool bUniform,gmx_large_int_t step)
3528 gmx_domdec_comm_t *comm;
3529 int d,dim,d1;
3530 gmx_bool bRowMember,bRowRoot;
3531 real *cell_f_row;
3533 comm = dd->comm;
3535 for(d=0; d<dd->ndim; d++)
3537 dim = dd->dim[d];
3538 bRowMember = TRUE;
3539 bRowRoot = TRUE;
3540 for(d1=d; d1<dd->ndim; d1++)
3542 if (dd->ci[dd->dim[d1]] > 0)
3544 if (d1 > d)
3546 bRowMember = FALSE;
3548 bRowRoot = FALSE;
3551 if (bRowMember)
3553 if (bRowRoot)
3555 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3556 ddbox,bDynamicBox,bUniform,step);
3557 cell_f_row = comm->root[d]->cell_f;
3559 else
3561 cell_f_row = comm->cell_f_row;
3563 distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3568 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3570 int d;
3572 /* This function assumes the box is static and should therefore
3573 * not be called when the box has changed since the last
3574 * call to dd_partition_system.
3576 for(d=0; d<dd->ndim; d++)
3578 relative_to_absolute_cell_bounds(dd,ddbox,d);
3584 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3585 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3586 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3587 gmx_wallcycle_t wcycle)
3589 gmx_domdec_comm_t *comm;
3590 int dim;
3592 comm = dd->comm;
3594 if (bDoDLB)
3596 wallcycle_start(wcycle,ewcDDCOMMBOUND);
3597 set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3598 wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3600 else if (bDynamicBox)
3602 set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3605 /* Set the dimensions for which no DD is used */
3606 for(dim=0; dim<DIM; dim++) {
3607 if (dd->nc[dim] == 1) {
3608 comm->cell_x0[dim] = 0;
3609 comm->cell_x1[dim] = ddbox->box_size[dim];
3610 if (dim >= ddbox->nboundeddim)
3612 comm->cell_x0[dim] += ddbox->box0[dim];
3613 comm->cell_x1[dim] += ddbox->box0[dim];
3619 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3621 int d,np,i;
3622 gmx_domdec_comm_dim_t *cd;
3624 for(d=0; d<dd->ndim; d++)
3626 cd = &dd->comm->cd[d];
3627 np = npulse[dd->dim[d]];
3628 if (np > cd->np_nalloc)
3630 if (debug)
3632 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3633 dim2char(dd->dim[d]),np);
3635 if (DDMASTER(dd) && cd->np_nalloc > 0)
3637 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3639 srenew(cd->ind,np);
3640 for(i=cd->np_nalloc; i<np; i++)
3642 cd->ind[i].index = NULL;
3643 cd->ind[i].nalloc = 0;
3645 cd->np_nalloc = np;
3647 cd->np = np;
3652 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3653 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3654 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3655 gmx_wallcycle_t wcycle)
3657 gmx_domdec_comm_t *comm;
3658 int d;
3659 ivec npulse;
3661 comm = dd->comm;
3663 /* Copy the old cell boundaries for the cg displacement check */
3664 copy_rvec(comm->cell_x0,comm->old_cell_x0);
3665 copy_rvec(comm->cell_x1,comm->old_cell_x1);
3667 if (comm->bDynLoadBal)
3669 if (DDMASTER(dd))
3671 check_box_size(dd,ddbox);
3673 set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3675 else
3677 set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3678 realloc_comm_ind(dd,npulse);
3681 if (debug)
3683 for(d=0; d<DIM; d++)
3685 fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3686 d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3691 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3692 gmx_ddbox_t *ddbox,
3693 rvec cell_ns_x0,rvec cell_ns_x1,
3694 gmx_large_int_t step)
3696 gmx_domdec_comm_t *comm;
3697 int dim_ind,dim;
3699 comm = dd->comm;
3701 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3703 dim = dd->dim[dim_ind];
3705 /* Without PBC we don't have restrictions on the outer cells */
3706 if (!(dim >= ddbox->npbcdim &&
3707 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3708 comm->bDynLoadBal &&
3709 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3710 comm->cellsize_min[dim])
3712 char buf[22];
3713 gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3714 gmx_step_str(step,buf),dim2char(dim),
3715 comm->cell_x1[dim] - comm->cell_x0[dim],
3716 ddbox->skew_fac[dim],
3717 dd->comm->cellsize_min[dim],
3718 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3722 if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3724 /* Communicate the boundaries and update cell_ns_x0/1 */
3725 dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3726 if (dd->bGridJump && dd->ndim > 1)
3728 check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3733 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3735 if (YY < npbcdim)
3737 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3739 else
3741 tcm[YY][XX] = 0;
3743 if (ZZ < npbcdim)
3745 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3746 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3748 else
3750 tcm[ZZ][XX] = 0;
3751 tcm[ZZ][YY] = 0;
3755 static void check_screw_box(matrix box)
3757 /* Mathematical limitation */
3758 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3760 gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3763 /* Limitation due to the asymmetry of the eighth shell method */
3764 if (box[ZZ][YY] != 0)
3766 gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3770 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3771 matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3772 gmx_domdec_t *dd)
3774 gmx_domdec_master_t *ma;
3775 int **tmp_ind=NULL,*tmp_nalloc=NULL;
3776 int i,icg,j,k,k0,k1,d,npbcdim;
3777 matrix tcm;
3778 rvec box_size,cg_cm;
3779 ivec ind;
3780 real nrcg,inv_ncg,pos_d;
3781 atom_id *cgindex;
3782 gmx_bool bUnbounded,bScrew;
3784 ma = dd->ma;
3786 if (tmp_ind == NULL)
3788 snew(tmp_nalloc,dd->nnodes);
3789 snew(tmp_ind,dd->nnodes);
3790 for(i=0; i<dd->nnodes; i++)
3792 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3793 snew(tmp_ind[i],tmp_nalloc[i]);
3797 /* Clear the count */
3798 for(i=0; i<dd->nnodes; i++)
3800 ma->ncg[i] = 0;
3801 ma->nat[i] = 0;
3804 make_tric_corr_matrix(dd->npbcdim,box,tcm);
3806 cgindex = cgs->index;
3808 /* Compute the center of geometry for all charge groups */
3809 for(icg=0; icg<cgs->nr; icg++)
3811 k0 = cgindex[icg];
3812 k1 = cgindex[icg+1];
3813 nrcg = k1 - k0;
3814 if (nrcg == 1)
3816 copy_rvec(pos[k0],cg_cm);
3818 else
3820 inv_ncg = 1.0/nrcg;
3822 clear_rvec(cg_cm);
3823 for(k=k0; (k<k1); k++)
3825 rvec_inc(cg_cm,pos[k]);
3827 for(d=0; (d<DIM); d++)
3829 cg_cm[d] *= inv_ncg;
3832 /* Put the charge group in the box and determine the cell index */
3833 for(d=DIM-1; d>=0; d--) {
3834 pos_d = cg_cm[d];
3835 if (d < dd->npbcdim)
3837 bScrew = (dd->bScrewPBC && d == XX);
3838 if (tric_dir[d] && dd->nc[d] > 1)
3840 /* Use triclinic coordintates for this dimension */
3841 for(j=d+1; j<DIM; j++)
3843 pos_d += cg_cm[j]*tcm[j][d];
3846 while(pos_d >= box[d][d])
3848 pos_d -= box[d][d];
3849 rvec_dec(cg_cm,box[d]);
3850 if (bScrew)
3852 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3853 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3855 for(k=k0; (k<k1); k++)
3857 rvec_dec(pos[k],box[d]);
3858 if (bScrew)
3860 pos[k][YY] = box[YY][YY] - pos[k][YY];
3861 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3865 while(pos_d < 0)
3867 pos_d += box[d][d];
3868 rvec_inc(cg_cm,box[d]);
3869 if (bScrew)
3871 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3872 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3874 for(k=k0; (k<k1); k++)
3876 rvec_inc(pos[k],box[d]);
3877 if (bScrew) {
3878 pos[k][YY] = box[YY][YY] - pos[k][YY];
3879 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3884 /* This could be done more efficiently */
3885 ind[d] = 0;
3886 while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3888 ind[d]++;
3891 i = dd_index(dd->nc,ind);
3892 if (ma->ncg[i] == tmp_nalloc[i])
3894 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3895 srenew(tmp_ind[i],tmp_nalloc[i]);
3897 tmp_ind[i][ma->ncg[i]] = icg;
3898 ma->ncg[i]++;
3899 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3902 k1 = 0;
3903 for(i=0; i<dd->nnodes; i++)
3905 ma->index[i] = k1;
3906 for(k=0; k<ma->ncg[i]; k++)
3908 ma->cg[k1++] = tmp_ind[i][k];
3911 ma->index[dd->nnodes] = k1;
3913 for(i=0; i<dd->nnodes; i++)
3915 sfree(tmp_ind[i]);
3917 sfree(tmp_ind);
3918 sfree(tmp_nalloc);
3920 if (fplog)
3922 char buf[22];
3923 fprintf(fplog,"Charge group distribution at step %s:",
3924 gmx_step_str(step,buf));
3925 for(i=0; i<dd->nnodes; i++)
3927 fprintf(fplog," %d",ma->ncg[i]);
3929 fprintf(fplog,"\n");
3933 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3934 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3935 rvec pos[])
3937 gmx_domdec_master_t *ma=NULL;
3938 ivec npulse;
3939 int i,cg_gl;
3940 int *ibuf,buf2[2] = { 0, 0 };
3941 gmx_bool bMaster = DDMASTER(dd);
3942 if (bMaster)
3944 ma = dd->ma;
3946 if (dd->bScrewPBC)
3948 check_screw_box(box);
3951 set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3953 distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3954 for(i=0; i<dd->nnodes; i++)
3956 ma->ibuf[2*i] = ma->ncg[i];
3957 ma->ibuf[2*i+1] = ma->nat[i];
3959 ibuf = ma->ibuf;
3961 else
3963 ibuf = NULL;
3965 dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3967 dd->ncg_home = buf2[0];
3968 dd->nat_home = buf2[1];
3969 dd->ncg_tot = dd->ncg_home;
3970 dd->nat_tot = dd->nat_home;
3971 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3973 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3974 srenew(dd->index_gl,dd->cg_nalloc);
3975 srenew(dd->cgindex,dd->cg_nalloc+1);
3977 if (bMaster)
3979 for(i=0; i<dd->nnodes; i++)
3981 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3982 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3986 dd_scatterv(dd,
3987 DDMASTER(dd) ? ma->ibuf : NULL,
3988 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3989 DDMASTER(dd) ? ma->cg : NULL,
3990 dd->ncg_home*sizeof(int),dd->index_gl);
3992 /* Determine the home charge group sizes */
3993 dd->cgindex[0] = 0;
3994 for(i=0; i<dd->ncg_home; i++)
3996 cg_gl = dd->index_gl[i];
3997 dd->cgindex[i+1] =
3998 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4001 if (debug)
4003 fprintf(debug,"Home charge groups:\n");
4004 for(i=0; i<dd->ncg_home; i++)
4006 fprintf(debug," %d",dd->index_gl[i]);
4007 if (i % 10 == 9)
4008 fprintf(debug,"\n");
4010 fprintf(debug,"\n");
4014 static int compact_and_copy_vec_at(int ncg,int *move,
4015 int *cgindex,
4016 int nvec,int vec,
4017 rvec *src,gmx_domdec_comm_t *comm,
4018 gmx_bool bCompact)
4020 int m,icg,i,i0,i1,nrcg;
4021 int home_pos;
4022 int pos_vec[DIM*2];
4024 home_pos = 0;
4026 for(m=0; m<DIM*2; m++)
4028 pos_vec[m] = 0;
4031 i0 = 0;
4032 for(icg=0; icg<ncg; icg++)
4034 i1 = cgindex[icg+1];
4035 m = move[icg];
4036 if (m == -1)
4038 if (bCompact)
4040 /* Compact the home array in place */
4041 for(i=i0; i<i1; i++)
4043 copy_rvec(src[i],src[home_pos++]);
4047 else
4049 /* Copy to the communication buffer */
4050 nrcg = i1 - i0;
4051 pos_vec[m] += 1 + vec*nrcg;
4052 for(i=i0; i<i1; i++)
4054 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4056 pos_vec[m] += (nvec - vec - 1)*nrcg;
4058 if (!bCompact)
4060 home_pos += i1 - i0;
4062 i0 = i1;
4065 return home_pos;
4068 static int compact_and_copy_vec_cg(int ncg,int *move,
4069 int *cgindex,
4070 int nvec,rvec *src,gmx_domdec_comm_t *comm,
4071 gmx_bool bCompact)
4073 int m,icg,i0,i1,nrcg;
4074 int home_pos;
4075 int pos_vec[DIM*2];
4077 home_pos = 0;
4079 for(m=0; m<DIM*2; m++)
4081 pos_vec[m] = 0;
4084 i0 = 0;
4085 for(icg=0; icg<ncg; icg++)
4087 i1 = cgindex[icg+1];
4088 m = move[icg];
4089 if (m == -1)
4091 if (bCompact)
4093 /* Compact the home array in place */
4094 copy_rvec(src[icg],src[home_pos++]);
4097 else
4099 nrcg = i1 - i0;
4100 /* Copy to the communication buffer */
4101 copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4102 pos_vec[m] += 1 + nrcg*nvec;
4104 i0 = i1;
4106 if (!bCompact)
4108 home_pos = ncg;
4111 return home_pos;
4114 static int compact_ind(int ncg,int *move,
4115 int *index_gl,int *cgindex,
4116 int *gatindex,
4117 gmx_ga2la_t ga2la,char *bLocalCG,
4118 int *cginfo)
4120 int cg,nat,a0,a1,a,a_gl;
4121 int home_pos;
4123 home_pos = 0;
4124 nat = 0;
4125 for(cg=0; cg<ncg; cg++)
4127 a0 = cgindex[cg];
4128 a1 = cgindex[cg+1];
4129 if (move[cg] == -1)
4131 /* Compact the home arrays in place.
4132 * Anything that can be done here avoids access to global arrays.
4134 cgindex[home_pos] = nat;
4135 for(a=a0; a<a1; a++)
4137 a_gl = gatindex[a];
4138 gatindex[nat] = a_gl;
4139 /* The cell number stays 0, so we don't need to set it */
4140 ga2la_change_la(ga2la,a_gl,nat);
4141 nat++;
4143 index_gl[home_pos] = index_gl[cg];
4144 cginfo[home_pos] = cginfo[cg];
4145 /* The charge group remains local, so bLocalCG does not change */
4146 home_pos++;
4148 else
4150 /* Clear the global indices */
4151 for(a=a0; a<a1; a++)
4153 ga2la_del(ga2la,gatindex[a]);
4155 if (bLocalCG)
4157 bLocalCG[index_gl[cg]] = FALSE;
4161 cgindex[home_pos] = nat;
4163 return home_pos;
4166 static void clear_and_mark_ind(int ncg,int *move,
4167 int *index_gl,int *cgindex,int *gatindex,
4168 gmx_ga2la_t ga2la,char *bLocalCG,
4169 int *cell_index)
4171 int cg,a0,a1,a;
4173 for(cg=0; cg<ncg; cg++)
4175 if (move[cg] >= 0)
4177 a0 = cgindex[cg];
4178 a1 = cgindex[cg+1];
4179 /* Clear the global indices */
4180 for(a=a0; a<a1; a++)
4182 ga2la_del(ga2la,gatindex[a]);
4184 if (bLocalCG)
4186 bLocalCG[index_gl[cg]] = FALSE;
4188 /* Signal that this cg has moved using the ns cell index.
4189 * Here we set it to -1. fill_grid will change it
4190 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4192 cell_index[cg] = -1;
4197 static void print_cg_move(FILE *fplog,
4198 gmx_domdec_t *dd,
4199 gmx_large_int_t step,int cg,int dim,int dir,
4200 gmx_bool bHaveLimitdAndCMOld,real limitd,
4201 rvec cm_old,rvec cm_new,real pos_d)
4203 gmx_domdec_comm_t *comm;
4204 char buf[22];
4206 comm = dd->comm;
4208 fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4209 if (bHaveLimitdAndCMOld)
4211 fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4212 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4214 else
4216 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4217 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4219 fprintf(fplog,"distance out of cell %f\n",
4220 dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4221 if (bHaveLimitdAndCMOld)
4223 fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4224 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4226 fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4227 cm_new[XX],cm_new[YY],cm_new[ZZ]);
4228 fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4229 dim2char(dim),
4230 comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4231 fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4232 dim2char(dim),
4233 comm->cell_x0[dim],comm->cell_x1[dim]);
4236 static void cg_move_error(FILE *fplog,
4237 gmx_domdec_t *dd,
4238 gmx_large_int_t step,int cg,int dim,int dir,
4239 gmx_bool bHaveLimitdAndCMOld,real limitd,
4240 rvec cm_old,rvec cm_new,real pos_d)
4242 if (fplog)
4244 print_cg_move(fplog, dd,step,cg,dim,dir,
4245 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4247 print_cg_move(stderr,dd,step,cg,dim,dir,
4248 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4249 gmx_fatal(FARGS,
4250 "A charge group moved too far between two domain decomposition steps\n"
4251 "This usually means that your system is not well equilibrated");
4254 static void rotate_state_atom(t_state *state,int a)
4256 int est;
4258 for(est=0; est<estNR; est++)
4260 if (EST_DISTR(est) && (state->flags & (1<<est))) {
4261 switch (est) {
4262 case estX:
4263 /* Rotate the complete state; for a rectangular box only */
4264 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4265 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4266 break;
4267 case estV:
4268 state->v[a][YY] = -state->v[a][YY];
4269 state->v[a][ZZ] = -state->v[a][ZZ];
4270 break;
4271 case estSDX:
4272 state->sd_X[a][YY] = -state->sd_X[a][YY];
4273 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4274 break;
4275 case estCGP:
4276 state->cg_p[a][YY] = -state->cg_p[a][YY];
4277 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4278 break;
4279 case estDISRE_INITF:
4280 case estDISRE_RM3TAV:
4281 case estORIRE_INITF:
4282 case estORIRE_DTAV:
4283 /* These are distances, so not affected by rotation */
4284 break;
4285 default:
4286 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4292 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4294 if (natoms > comm->moved_nalloc)
4296 /* Contents should be preserved here */
4297 comm->moved_nalloc = over_alloc_dd(natoms);
4298 srenew(comm->moved,comm->moved_nalloc);
4301 return comm->moved;
/* Computes the new center of each charge group in [cg_start,cg_end),
 * applies the periodic shifts to the center and to the atoms of the group,
 * and records in move[] where the group should go.
 *
 * The center of a single-atom group is the atom position state->x[k0];
 * otherwise it is the unweighted average of the atom positions.
 * For every dimension d with dd->nc[d] > 1 the position along d (with the
 * tcm correction when tric_dir[d] is set) is compared with cell_x0/cell_x1.
 * A position beyond limit0/limit1 is reported through cg_move_error().
 * dev[d] is set to 1 for a forward and -1 for a backward crossing.
 * Dimensions that are not decomposed but have d < npbcdim only get the
 * group wrapped into the box.
 *
 * The new center is stored in cg_cm[cg]. move[cg] is set to mc + flag,
 * where mc is the send buffer index of the first DD dimension with a
 * crossing (-1 when there is none) and flag holds the DD_FLAG_FW/DD_FLAG_BW
 * bits; dd_redistribute_cg separates the two again.
 */
4304 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4305 gmx_domdec_t *dd,
4306 t_state *state,
4307 ivec tric_dir,matrix tcm,
4308 rvec cell_x0,rvec cell_x1,
4309 rvec limitd,rvec limit0,rvec limit1,
4310 const int *cgindex,
4311 int cg_start,int cg_end,
4312 rvec *cg_cm,
4313 int *move)
4315 int npbcdim;
4316 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4317 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4318 int flag;
4319 gmx_bool bScrew;
4320 ivec dev;
4321 real inv_ncg,pos_d;
4322 rvec cm_new;
4324 npbcdim = dd->npbcdim;
4326 for(cg=cg_start; cg<cg_end; cg++)
/* Atoms k0 up to, but not including, k1 belong to this charge group */
4328 k0 = cgindex[cg];
4329 k1 = cgindex[cg+1];
4330 nrcg = k1 - k0;
4331 if (nrcg == 1)
4333 copy_rvec(state->x[k0],cm_new);
4335 else
/* Average the atom positions */
4337 inv_ncg = 1.0/nrcg;
4339 clear_rvec(cm_new);
4340 for(k=k0; (k<k1); k++)
4342 rvec_inc(cm_new,state->x[k]);
4344 for(d=0; (d<DIM); d++)
4346 cm_new[d] = inv_ncg*cm_new[d];
4350 clear_ivec(dev);
4351 /* Do pbc and check DD cell boundary crossings */
4352 for(d=DIM-1; d>=0; d--)
4354 if (dd->nc[d] > 1)
4356 bScrew = (dd->bScrewPBC && d == XX);
4357 /* Determine the location of this cg in lattice coordinates */
4358 pos_d = cm_new[d];
4359 if (tric_dir[d])
/* Add the contributions of the higher dimensions through tcm */
4361 for(d2=d+1; d2<DIM; d2++)
4363 pos_d += cm_new[d2]*tcm[d2][d];
4366 /* Put the charge group in the triclinic unit-cell */
4367 if (pos_d >= cell_x1[d])
4369 if (pos_d >= limit1[d])
4371 cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4372 cg_cm[cg],cm_new,pos_d);
4374 dev[d] = 1;
/* In the last cell along d a forward move wraps around: subtract box vector d */
4375 if (dd->ci[d] == dd->nc[d] - 1)
4377 rvec_dec(cm_new,state->box[d]);
4378 if (bScrew)
/* With screw pbc the y and z coordinates are mirrored as well */
4380 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4381 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4383 for(k=k0; (k<k1); k++)
4385 rvec_dec(state->x[k],state->box[d]);
4386 if (bScrew)
4388 rotate_state_atom(state,k);
4393 else if (pos_d < cell_x0[d])
4395 if (pos_d < limit0[d])
4397 cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4398 cg_cm[cg],cm_new,pos_d);
4400 dev[d] = -1;
/* In the first cell along d a backward move wraps around: add box vector d */
4401 if (dd->ci[d] == 0)
4403 rvec_inc(cm_new,state->box[d]);
4404 if (bScrew)
4406 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4407 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4409 for(k=k0; (k<k1); k++)
4411 rvec_inc(state->x[k],state->box[d]);
4412 if (bScrew)
4414 rotate_state_atom(state,k);
4420 else if (d < npbcdim)
4422 /* Put the charge group in the rectangular unit-cell */
4423 while (cm_new[d] >= state->box[d][d])
4425 rvec_dec(cm_new,state->box[d]);
4426 for(k=k0; (k<k1); k++)
4428 rvec_dec(state->x[k],state->box[d]);
4431 while (cm_new[d] < 0)
4433 rvec_inc(cm_new,state->box[d]);
4434 for(k=k0; (k<k1); k++)
4436 rvec_inc(state->x[k],state->box[d]);
4442 copy_rvec(cm_new,cg_cm[cg]);
4444 /* Determine where this cg should go */
4445 flag = 0;
4446 mc = -1;
4447 for(d=0; d<dd->ndim; d++)
4449 dim = dd->dim[d];
4450 if (dev[dim] == 1)
4452 flag |= DD_FLAG_FW(d);
/* Only the first DD dimension with a crossing selects the send buffer */
4453 if (mc == -1)
4455 mc = d*2;
4458 else if (dev[dim] == -1)
4460 flag |= DD_FLAG_BW(d);
4461 if (mc == -1) {
/* With at most two cells along dim the forward buffer is used for backward moves too */
4462 if (dd->nc[dim] > 2)
4464 mc = d*2 + 1;
4466 else
4468 mc = d*2;
4473 /* Temporarily store the flag in move */
4474 move[cg] = mc + flag;
/* Redistributes the home charge groups over the domain decomposition
 * cells after the atoms have moved.
 *
 * Steps, in the order in which they appear below:
 *  - determine which distributed state vectors (v, sd_X, cg_p) are present;
 *  - set up the cell boundaries and the error limits per dimension;
 *  - call calc_cg_move() for a slice of the home charge groups per thread,
 *    which fills move[]; with the group cut-off scheme the centers go to
 *    fr->cg_cm, otherwise state->x is passed in that role;
 *  - split every move[] entry into the send buffer index and the flags and
 *    store global index and size|flags in comm->cggl_flag;
 *  - copy the state of the leaving groups to comm->cgcm_state and either
 *    compact the home arrays (bCompact) or mark the leaving groups;
 *  - per DD dimension exchange counts, flags and state with the neighbor
 *    cells through dd_sendrecv_int/dd_sendrecv_rvec; a received group is
 *    either appended to the home arrays or copied to the send buffer of a
 *    later dimension;
 *  - update dd->ncg_home and dd->nat_home.
 *
 * Output arguments:
 *  *ncg_moved     the number of home charge groups placed in send buffers
 *                 before communication;
 *  *ncg_stay_home the value of home_pos_cg before any group is received.
 * The state arrays and *f can be reallocated through dd_check_alloc_ncg()
 * and dd_realloc_state() while groups are received.
 */
4478 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4479 gmx_domdec_t *dd,ivec tric_dir,
4480 t_state *state,rvec **f,
4481 t_forcerec *fr,t_mdatoms *md,
4482 gmx_bool bCompact,
4483 t_nrnb *nrnb,
4484 int *ncg_stay_home,
4485 int *ncg_moved)
4487 int *move;
4488 int npbcdim;
4489 int ncg[DIM*2],nat[DIM*2];
4490 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4491 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4492 int sbuf[2],rbuf[2];
4493 int home_pos_cg,home_pos_at,buf_pos;
4494 int flag;
4495 gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4496 gmx_bool bScrew;
4497 ivec dev;
4498 real inv_ncg,pos_d;
4499 matrix tcm;
4500 rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4501 atom_id *cgindex;
4502 cginfo_mb_t *cginfo_mb;
4503 gmx_domdec_comm_t *comm;
4504 int *moved;
4505 int nthread,thread;
4507 if (dd->bScrewPBC)
4509 check_screw_box(state->box);
4512 comm = dd->comm;
/* Only the group cut-off scheme keeps separate charge group centers */
4513 if (fr->cutoff_scheme == ecutsGROUP)
4515 cg_cm = fr->cg_cm;
/* Find out which distributed per-atom state vectors are in use */
4518 for(i=0; i<estNR; i++)
4520 if (EST_DISTR(i))
4522 switch (i)
4524 case estX: /* Always present */ break;
4525 case estV: bV = (state->flags & (1<<i)); break;
4526 case estSDX: bSDX = (state->flags & (1<<i)); break;
4527 case estCGP: bCGP = (state->flags & (1<<i)); break;
4528 case estLD_RNG:
4529 case estLD_RNGI:
4530 case estDISRE_INITF:
4531 case estDISRE_RM3TAV:
4532 case estORIRE_INITF:
4533 case estORIRE_DTAV:
4534 /* No processing required */
4535 break;
4536 default:
4537 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
/* move[] lives in the shared integer buffer, sized for dd->ncg_tot entries */
4542 if (dd->ncg_tot > comm->nalloc_int)
4544 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4545 srenew(comm->buf_int,comm->nalloc_int);
4547 move = comm->buf_int;
4549 /* Clear the count */
4550 for(c=0; c<dd->ndim*2; c++)
4552 ncg[c] = 0;
4553 nat[c] = 0;
4556 npbcdim = dd->npbcdim;
/* Per dimension: the cell boundaries for the crossing test and the limits beyond which a move is an error */
4558 for(d=0; (d<DIM); d++)
4560 limitd[d] = dd->comm->cellsize_min[d];
4561 if (d >= npbcdim && dd->ci[d] == 0)
4563 cell_x0[d] = -GMX_FLOAT_MAX;
4565 else
4567 cell_x0[d] = comm->cell_x0[d];
4569 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4571 cell_x1[d] = GMX_FLOAT_MAX;
4573 else
4575 cell_x1[d] = comm->cell_x1[d];
4577 if (d < npbcdim)
4579 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4580 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4582 else
4584 /* We check after communication if a charge group moved
4585 * more than one cell. Set the pre-comm check limit to float_max.
4587 limit0[d] = -GMX_FLOAT_MAX;
4588 limit1[d] = GMX_FLOAT_MAX;
4592 make_tric_corr_matrix(npbcdim,state->box,tcm);
4594 cgindex = dd->cgindex;
4596 nthread = gmx_omp_nthreads_get(emntDomdec);
4598 /* Compute the center of geometry for all home charge groups
4599 * and put them in the box and determine where they should go.
4601 #pragma omp parallel for num_threads(nthread) schedule(static)
4602 for(thread=0; thread<nthread; thread++)
4604 calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4605 cell_x0,cell_x1,limitd,limit0,limit1,
4606 cgindex,
4607 ( thread *dd->ncg_home)/nthread,
4608 ((thread+1)*dd->ncg_home)/nthread,
4609 fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4610 move);
4613 for(cg=0; cg<dd->ncg_home; cg++)
4615 if (move[cg] >= 0)
4617 mc = move[cg];
4618 flag = mc & ~DD_FLAG_NRCG;
4619 mc = mc & DD_FLAG_NRCG;
4620 move[cg] = mc;
4622 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4624 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4625 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4627 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4628 /* We store the cg size in the lower 16 bits
4629 * and the place where the charge group should go
4630 * in the next 6 bits. This saves some communication volume.
4632 nrcg = cgindex[cg+1] - cgindex[cg];
4633 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4634 ncg[mc] += 1;
4635 nat[mc] += nrcg;
4639 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4640 inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4642 *ncg_moved = 0;
4643 for(i=0; i<dd->ndim*2; i++)
4645 *ncg_moved += ncg[i];
4648 nvec = 1;
4649 if (bV)
4651 nvec++;
4653 if (bSDX)
4655 nvec++;
4657 if (bCGP)
4659 nvec++;
4662 /* Make sure the communication buffers are large enough */
4663 for(mc=0; mc<dd->ndim*2; mc++)
4665 nvr = ncg[mc] + nat[mc]*nvec;
4666 if (nvr > comm->cgcm_state_nalloc[mc])
4668 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4669 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4673 switch (fr->cutoff_scheme)
4675 case ecutsGROUP:
4676 /* Recalculating cg_cm might be cheaper than communicating,
4677 * but that could give rise to rounding issues.
4679 home_pos_cg =
4680 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4681 nvec,cg_cm,comm,bCompact);
4682 break;
4683 case ecutsVERLET:
4684 /* Without charge groups we send the moved atom coordinates
4685 * over twice. This is so the code below can be used without
4686 * many conditionals for both for with and without charge groups.
4688 home_pos_cg =
4689 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4690 nvec,state->x,comm,FALSE);
4691 if (bCompact)
4693 home_pos_cg -= *ncg_moved;
4695 break;
4696 default:
4697 gmx_incons("unimplemented");
4698 home_pos_cg = 0;
4701 vec = 0;
4702 home_pos_at =
4703 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4704 nvec,vec++,state->x,comm,bCompact);
4705 if (bV)
4707 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4708 nvec,vec++,state->v,comm,bCompact);
4710 if (bSDX)
4712 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4713 nvec,vec++,state->sd_X,comm,bCompact);
4715 if (bCGP)
4717 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4718 nvec,vec++,state->cg_p,comm,bCompact);
4721 if (bCompact)
4723 compact_ind(dd->ncg_home,move,
4724 dd->index_gl,dd->cgindex,dd->gatindex,
4725 dd->ga2la,comm->bLocalCG,
4726 fr->cginfo);
4728 else
4730 if (fr->cutoff_scheme == ecutsVERLET)
4732 moved = get_moved(comm,dd->ncg_home);
4734 for(k=0; k<dd->ncg_home; k++)
4736 moved[k] = 0;
4739 else
4741 moved = fr->ns.grid->cell_index;
4744 clear_and_mark_ind(dd->ncg_home,move,
4745 dd->index_gl,dd->cgindex,dd->gatindex,
4746 dd->ga2la,comm->bLocalCG,
4747 moved);
4750 cginfo_mb = fr->cginfo_mb;
4752 *ncg_stay_home = home_pos_cg;
4753 for(d=0; d<dd->ndim; d++)
4755 dim = dd->dim[d];
4756 ncg_recv = 0;
4757 nat_recv = 0;
4758 nvr = 0;
4759 for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4761 cdd = d*2 + dir;
4762 /* Communicate the cg and atom counts */
4763 sbuf[0] = ncg[cdd];
4764 sbuf[1] = nat[cdd];
4765 if (debug)
4767 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4768 d,dir,sbuf[0],sbuf[1]);
4770 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4772 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4774 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4775 srenew(comm->buf_int,comm->nalloc_int);
4778 /* Communicate the charge group indices, sizes and flags */
4779 dd_sendrecv_int(dd, d, dir,
4780 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4781 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4783 nvs = ncg[cdd] + nat[cdd]*nvec;
4784 i = rbuf[0] + rbuf[1] *nvec;
4785 vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4787 /* Communicate cgcm and state */
4788 dd_sendrecv_rvec(dd, d, dir,
4789 comm->cgcm_state[cdd], nvs,
4790 comm->vbuf.v+nvr, i);
4791 ncg_recv += rbuf[0];
4792 nat_recv += rbuf[1];
4793 nvr += i;
4796 /* Process the received charge groups */
4797 buf_pos = 0;
4798 for(cg=0; cg<ncg_recv; cg++)
4800 flag = comm->buf_int[cg*DD_CGIBS+1];
4802 if (dim >= npbcdim && dd->nc[dim] > 2)
4804 /* No pbc in this dim and more than one domain boundary.
4805 * We do a separate check if a charge group didn't move too far.
4807 if (((flag & DD_FLAG_FW(d)) &&
4808 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4809 ((flag & DD_FLAG_BW(d)) &&
4810 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4812 cg_move_error(fplog,dd,step,cg,dim,
4813 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4814 FALSE,0,
4815 comm->vbuf.v[buf_pos],
4816 comm->vbuf.v[buf_pos],
4817 comm->vbuf.v[buf_pos][dim]);
4821 mc = -1;
4822 if (d < dd->ndim-1)
4824 /* Check which direction this cg should go */
4825 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4827 if (dd->bGridJump)
4829 /* The cell boundaries for dimension d2 are not equal
4830 * for each cell row of the lower dimension(s),
4831 * therefore we might need to redetermine where
4832 * this cg should go.
4834 dim2 = dd->dim[d2];
4835 /* If this cg crosses the box boundary in dimension d2
4836 * we can use the communicated flag, so we do not
4837 * have to worry about pbc.
4839 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4840 (flag & DD_FLAG_FW(d2))) ||
4841 (dd->ci[dim2] == 0 &&
4842 (flag & DD_FLAG_BW(d2)))))
4844 /* Clear the two flags for this dimension */
4845 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4846 /* Determine the location of this cg
4847 * in lattice coordinates
4849 pos_d = comm->vbuf.v[buf_pos][dim2];
4850 if (tric_dir[dim2])
4852 for(d3=dim2+1; d3<DIM; d3++)
4854 pos_d +=
4855 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4858 /* Check of we are not at the box edge.
4859 * pbc is only handled in the first step above,
4860 * but this check could move over pbc while
4861 * the first step did not due to different rounding.
4863 if (pos_d >= cell_x1[dim2] &&
4864 dd->ci[dim2] != dd->nc[dim2]-1)
4866 flag |= DD_FLAG_FW(d2);
4868 else if (pos_d < cell_x0[dim2] &&
4869 dd->ci[dim2] != 0)
4871 flag |= DD_FLAG_BW(d2);
4873 comm->buf_int[cg*DD_CGIBS+1] = flag;
4876 /* Set to which neighboring cell this cg should go */
4877 if (flag & DD_FLAG_FW(d2))
4879 mc = d2*2;
4881 else if (flag & DD_FLAG_BW(d2))
4883 if (dd->nc[dd->dim[d2]] > 2)
4885 mc = d2*2+1;
4887 else
4889 mc = d2*2;
4895 nrcg = flag & DD_FLAG_NRCG;
4896 if (mc == -1)
4898 if (home_pos_cg+1 > dd->cg_nalloc)
4900 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4901 srenew(dd->index_gl,dd->cg_nalloc);
4902 srenew(dd->cgindex,dd->cg_nalloc+1);
4904 /* Set the global charge group index and size */
4905 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4906 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4907 /* Copy the state from the buffer */
4908 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4909 if (fr->cutoff_scheme == ecutsGROUP)
4911 cg_cm = fr->cg_cm;
4912 copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4914 buf_pos++;
4916 /* Set the cginfo */
4917 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4918 dd->index_gl[home_pos_cg]);
4919 if (comm->bLocalCG)
4921 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4924 if (home_pos_at+nrcg > state->nalloc)
4926 dd_realloc_state(state,f,home_pos_at+nrcg);
4928 for(i=0; i<nrcg; i++)
4930 copy_rvec(comm->vbuf.v[buf_pos++],
4931 state->x[home_pos_at+i]);
4933 if (bV)
4935 for(i=0; i<nrcg; i++)
4937 copy_rvec(comm->vbuf.v[buf_pos++],
4938 state->v[home_pos_at+i]);
4941 if (bSDX)
4943 for(i=0; i<nrcg; i++)
4945 copy_rvec(comm->vbuf.v[buf_pos++],
4946 state->sd_X[home_pos_at+i]);
4949 if (bCGP)
4951 for(i=0; i<nrcg; i++)
4953 copy_rvec(comm->vbuf.v[buf_pos++],
4954 state->cg_p[home_pos_at+i]);
4957 home_pos_cg += 1;
4958 home_pos_at += nrcg;
4960 else
4962 /* Reallocate the buffers if necessary */
4963 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4965 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4966 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4968 nvr = ncg[mc] + nat[mc]*nvec;
4969 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4971 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4972 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4974 /* Copy from the receive to the send buffers */
4975 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4976 comm->buf_int + cg*DD_CGIBS,
4977 DD_CGIBS*sizeof(int));
4978 memcpy(comm->cgcm_state[mc][nvr],
4979 comm->vbuf.v[buf_pos],
4980 (1+nrcg*nvec)*sizeof(rvec));
4981 buf_pos += 1 + nrcg*nvec;
4982 ncg[mc] += 1;
4983 nat[mc] += nrcg;
4988 /* With sorting (!bCompact) the indices are now only partially up to date
4989 * and ncg_home and nat_home are not the real count, since there are
4990 * "holes" in the arrays for the charge groups that moved to neighbors.
4992 if (fr->cutoff_scheme == ecutsVERLET)
4994 moved = get_moved(comm,home_pos_cg);
4996 for(i=dd->ncg_home; i<home_pos_cg; i++)
4998 moved[i] = 0;
5001 dd->ncg_home = home_pos_cg;
5002 dd->nat_home = home_pos_at;
5004 if (debug)
5006 fprintf(debug,
5007 "Finished repartitioning: cgs moved out %d, new home %d\n",
5008 *ncg_moved,dd->ncg_home-*ncg_moved);
5013 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
5015 dd->comm->cycl[ddCycl] += cycles;
5016 dd->comm->cycl_n[ddCycl]++;
5017 if (cycles > dd->comm->cycl_max[ddCycl])
5019 dd->comm->cycl_max[ddCycl] = cycles;
5023 static double force_flop_count(t_nrnb *nrnb)
5025 int i;
5026 double sum;
5027 const char *name;
5029 sum = 0;
5030 for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5032 /* To get closer to the real timings, we half the count
5033 * for the normal loops and again half it for water loops.
5035 name = nrnb_str(i);
5036 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5038 sum += nrnb->n[i]*0.25*cost_nrnb(i);
5040 else
5042 sum += nrnb->n[i]*0.50*cost_nrnb(i);
5045 for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5047 name = nrnb_str(i);
5048 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5049 sum += nrnb->n[i]*cost_nrnb(i);
5051 for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5053 sum += nrnb->n[i]*cost_nrnb(i);
5056 return sum;
5059 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5061 if (dd->comm->eFlop)
5063 dd->comm->flop -= force_flop_count(nrnb);
5066 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5068 if (dd->comm->eFlop)
5070 dd->comm->flop += force_flop_count(nrnb);
5071 dd->comm->flop_n++;
5075 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5077 int i;
5079 for(i=0; i<ddCyclNr; i++)
5081 dd->comm->cycl[i] = 0;
5082 dd->comm->cycl_n[i] = 0;
5083 dd->comm->cycl_max[i] = 0;
5085 dd->comm->flop = 0;
5086 dd->comm->flop_n = 0;
/* Collects the load measurements along the rows of the DD grid and
 * accumulates the result on the master.
 *
 * The DD dimensions are handled from the last to the first. For each
 * dimension the participating ranks pack a row of floats in sbuf; with
 * GMX_MPI defined this row is gathered, as raw bytes, on rank 0 of
 * comm->mpi_comm_load[d]. The rank whose cell index in this dimension
 * equals dd->master_ci reduces the gathered rows into comm->load[d]
 * (sum, max, sum_m, cvol_min, flags, mdf, pme). The packing order in sbuf
 * and the unpacking order from load->load have to match.
 * Finally, on DDMASTER the totals in comm (nload, load_step, load_sum,
 * load_max, load_lim, load_mdf, load_pme) are updated from comm->load[0].
 * The whole routine is timed with the ewcDDCOMMLOAD wallcycle counter.
 */
5089 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5091 gmx_domdec_comm_t *comm;
5092 gmx_domdec_load_t *load;
5093 gmx_domdec_root_t *root=NULL;
5094 int d,dim,cid,i,pos;
5095 float cell_frac=0,sbuf[DD_NLOAD_MAX];
5096 gmx_bool bSepPME;
5098 if (debug)
5100 fprintf(debug,"get_load_distribution start\n");
5103 wallcycle_start(wcycle,ewcDDCOMMLOAD);
5105 comm = dd->comm;
/* PME load is only included when this rank has a PME node id assigned */
5107 bSepPME = (dd->pme_nodeid >= 0);
/* Go from the last DD dimension to the first, so the result of dimension d+1 is available for dimension d */
5109 for(d=dd->ndim-1; d>=0; d--)
5111 dim = dd->dim[d];
5112 /* Check if we participate in the communication in this dimension */
5113 if (d == dd->ndim-1 ||
5114 (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5116 load = &comm->load[d];
5117 if (dd->bGridJump)
5119 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5121 pos = 0;
/* In the last dimension each rank sends its own force load, in the other dimensions the reduced values of dimension d+1 */
5122 if (d == dd->ndim-1)
5124 sbuf[pos++] = dd_force_load(comm);
5125 sbuf[pos++] = sbuf[0];
5126 if (dd->bGridJump)
5128 sbuf[pos++] = sbuf[0];
5129 sbuf[pos++] = cell_frac;
5130 if (d > 0)
5132 sbuf[pos++] = comm->cell_f_max0[d];
5133 sbuf[pos++] = comm->cell_f_min1[d];
5136 if (bSepPME)
5138 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5139 sbuf[pos++] = comm->cycl[ddCyclPME];
5142 else
5144 sbuf[pos++] = comm->load[d+1].sum;
5145 sbuf[pos++] = comm->load[d+1].max;
5146 if (dd->bGridJump)
5148 sbuf[pos++] = comm->load[d+1].sum_m;
5149 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5150 sbuf[pos++] = comm->load[d+1].flags;
5151 if (d > 0)
5153 sbuf[pos++] = comm->cell_f_max0[d];
5154 sbuf[pos++] = comm->cell_f_min1[d];
5157 if (bSepPME)
5159 sbuf[pos++] = comm->load[d+1].mdf;
5160 sbuf[pos++] = comm->load[d+1].pme;
5163 load->nload = pos;
5164 /* Communicate a row in DD direction d.
5165 * The communicators are setup such that the root always has rank 0.
5167 #ifdef GMX_MPI
5168 MPI_Gather(sbuf ,load->nload*sizeof(float),MPI_BYTE,
5169 load->load,load->nload*sizeof(float),MPI_BYTE,
5170 0,comm->mpi_comm_load[d]);
5171 #endif
5172 if (dd->ci[dim] == dd->master_ci[dim])
5174 /* We are the root, process this row */
5175 if (comm->bDynLoadBal)
5177 root = comm->root[d];
5179 load->sum = 0;
5180 load->max = 0;
5181 load->sum_m = 0;
5182 load->cvol_min = 1;
5183 load->flags = 0;
5184 load->mdf = 0;
5185 load->pme = 0;
5186 pos = 0;
5187 for(i=0; i<dd->nc[dim]; i++)
5189 load->sum += load->load[pos++];
5190 load->max = max(load->max,load->load[pos]);
5191 pos++;
5192 if (dd->bGridJump)
5194 if (root->bLimited)
5196 /* This direction could not be load balanced properly,
5197 * therefore we need to use the maximum iso the average load.
5199 load->sum_m = max(load->sum_m,load->load[pos]);
5201 else
5203 load->sum_m += load->load[pos];
5205 pos++;
5206 load->cvol_min = min(load->cvol_min,load->load[pos]);
5207 pos++;
5208 if (d < dd->ndim-1)
5210 load->flags = (int)(load->load[pos++] + 0.5);
5212 if (d > 0)
5214 root->cell_f_max0[i] = load->load[pos++];
5215 root->cell_f_min1[i] = load->load[pos++];
5218 if (bSepPME)
5220 load->mdf = max(load->mdf,load->load[pos]);
5221 pos++;
5222 load->pme = max(load->pme,load->load[pos]);
5223 pos++;
5226 if (comm->bDynLoadBal && root->bLimited)
5228 load->sum_m *= dd->nc[dim];
5229 load->flags |= (1<<d);
5235 if (DDMASTER(dd))
5237 comm->nload += dd_load_count(comm);
5238 comm->load_step += comm->cycl[ddCyclStep];
5239 comm->load_sum += comm->load[0].sum;
5240 comm->load_max += comm->load[0].max;
5241 if (comm->bDynLoadBal)
5243 for(d=0; d<dd->ndim; d++)
5245 if (comm->load[0].flags & (1<<d))
5247 comm->load_lim[d]++;
5251 if (bSepPME)
5253 comm->load_mdf += comm->load[0].mdf;
5254 comm->load_pme += comm->load[0].pme;
5258 wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5260 if (debug)
5262 fprintf(debug,"get_load_distribution finished\n");
5266 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5268 /* Return the relative performance loss on the total run time
5269 * due to the force calculation load imbalance.
5271 if (dd->comm->nload > 0)
5273 return
5274 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5275 (dd->comm->load_step*dd->nnodes);
5277 else
5279 return 0;
5283 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5285 char buf[STRLEN];
5286 int npp,npme,nnodes,d,limp;
5287 float imbal,pme_f_ratio,lossf,lossp=0;
5288 gmx_bool bLim;
5289 gmx_domdec_comm_t *comm;
5291 comm = dd->comm;
5292 if (DDMASTER(dd) && comm->nload > 0)
5294 npp = dd->nnodes;
5295 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5296 nnodes = npp + npme;
5297 imbal = comm->load_max*npp/comm->load_sum - 1;
5298 lossf = dd_force_imb_perf_loss(dd);
5299 sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5300 fprintf(fplog,"%s",buf);
5301 fprintf(stderr,"\n");
5302 fprintf(stderr,"%s",buf);
5303 sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5304 fprintf(fplog,"%s",buf);
5305 fprintf(stderr,"%s",buf);
5306 bLim = FALSE;
5307 if (comm->bDynLoadBal)
5309 sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5310 for(d=0; d<dd->ndim; d++)
5312 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5313 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5314 if (limp >= 50)
5316 bLim = TRUE;
5319 sprintf(buf+strlen(buf),"\n");
5320 fprintf(fplog,"%s",buf);
5321 fprintf(stderr,"%s",buf);
5323 if (npme > 0)
5325 pme_f_ratio = comm->load_pme/comm->load_mdf;
5326 lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5327 if (lossp <= 0)
5329 lossp *= (float)npme/(float)nnodes;
5331 else
5333 lossp *= (float)npp/(float)nnodes;
5335 sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5336 fprintf(fplog,"%s",buf);
5337 fprintf(stderr,"%s",buf);
5338 sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5339 fprintf(fplog,"%s",buf);
5340 fprintf(stderr,"%s",buf);
5342 fprintf(fplog,"\n");
5343 fprintf(stderr,"\n");
5345 if (lossf >= DD_PERF_LOSS)
5347 sprintf(buf,
5348 "NOTE: %.1f %% performance was lost due to load imbalance\n"
5349 " in the domain decomposition.\n",lossf*100);
5350 if (!comm->bDynLoadBal)
5352 sprintf(buf+strlen(buf)," You might want to use dynamic load balancing (option -dlb.)\n");
5354 else if (bLim)
5356 sprintf(buf+strlen(buf)," You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5358 fprintf(fplog,"%s\n",buf);
5359 fprintf(stderr,"%s\n",buf);
5361 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5363 sprintf(buf,
5364 "NOTE: %.1f %% performance was lost because the PME nodes\n"
5365 " had %s work to do than the PP nodes.\n"
5366 " You might want to %s the number of PME nodes\n"
5367 " or %s the cut-off and the grid spacing.\n",
5368 fabs(lossp*100),
5369 (lossp < 0) ? "less" : "more",
5370 (lossp < 0) ? "decrease" : "increase",
5371 (lossp < 0) ? "decrease" : "increase");
5372 fprintf(fplog,"%s\n",buf);
5373 fprintf(stderr,"%s\n",buf);
5378 static float dd_vol_min(gmx_domdec_t *dd)
5380 return dd->comm->load[0].cvol_min*dd->nnodes;
5383 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5385 return dd->comm->load[0].flags;
5388 static float dd_f_imbal(gmx_domdec_t *dd)
5390 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5393 float dd_pme_f_ratio(gmx_domdec_t *dd)
5395 if (dd->comm->cycl_n[ddCyclPME] > 0)
5397 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5399 else
5401 return -1.0;
5405 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5407 int flags,d;
5408 char buf[22];
5410 flags = dd_load_flags(dd);
5411 if (flags)
5413 fprintf(fplog,
5414 "DD load balancing is limited by minimum cell size in dimension");
5415 for(d=0; d<dd->ndim; d++)
5417 if (flags & (1<<d))
5419 fprintf(fplog," %c",dim2char(dd->dim[d]));
5422 fprintf(fplog,"\n");
5424 fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
5425 if (dd->comm->bDynLoadBal)
5427 fprintf(fplog," vol min/aver %5.3f%c",
5428 dd_vol_min(dd),flags ? '!' : ' ');
5430 fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5431 if (dd->comm->cycl_n[ddCyclPME])
5433 fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5435 fprintf(fplog,"\n\n");
5438 static void dd_print_load_verbose(gmx_domdec_t *dd)
5440 if (dd->comm->bDynLoadBal)
5442 fprintf(stderr,"vol %4.2f%c ",
5443 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5445 fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5446 if (dd->comm->cycl_n[ddCyclPME])
5448 fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
#ifdef GMX_MPI
/* Creates the load communicator for the row of cells along DD dimension
 * index dim_ind that passes through cell location loc, and allocates the
 * data needed by the members of that row.
 *
 * MPI_Comm_split() on dd->mpi_comm_all is called by every rank; ranks
 * that are not in the row pass MPI_UNDEFINED as color. For members the
 * communicator is stored in dd->comm->mpi_comm_load[dim_ind]. The member
 * whose cell index along the dimension equals dd->master_ci is the root
 * of the row: it gets the root data (unless eDLB is edlbNO) and the load
 * receive buffer; the other members only get cell_f_row (again unless
 * eDLB is edlbNO).
 */
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
{
    MPI_Comm           row_comm;
    gmx_domdec_comm_t  *comm;
    gmx_domdec_root_t  *row_root;
    ivec               cell;
    int                dim,ncell,c;
    gmx_bool           bMember,bRowRoot;

    comm  = dd->comm;
    dim   = dd->dim[dim_ind];
    ncell = dd->nc[dim];

    /* Walk along the row to see if this rank is one of its cells */
    bMember = FALSE;
    copy_ivec(loc,cell);
    for(c=0; c<ncell; c++)
    {
        cell[dim] = c;
        if (dd_index(dd->nc,cell) == dd->rank)
        {
            bMember = TRUE;
        }
    }

    /* This call is made by members and non-members alike */
    MPI_Comm_split(dd->mpi_comm_all, bMember ? 0 : MPI_UNDEFINED, dd->rank,
                   &row_comm);
    if (!bMember)
    {
        return;
    }

    comm->mpi_comm_load[dim_ind] = row_comm;
    bRowRoot = (dd->ci[dim] == dd->master_ci[dim]);

    if (comm->eDLB != edlbNO)
    {
        if (bRowRoot)
        {
            /* This is the root process of this row */
            snew(comm->root[dim_ind],1);
            row_root = comm->root[dim_ind];
            snew(row_root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
            snew(row_root->old_cell_f,ncell+1);
            snew(row_root->bCellMin,ncell);
            if (dim_ind > 0)
            {
                snew(row_root->cell_f_max0,ncell);
                snew(row_root->cell_f_min1,ncell);
                snew(row_root->bound_min,ncell);
                snew(row_root->bound_max,ncell);
            }
            snew(row_root->buf_ncd,ncell);
        }
        else
        {
            /* This is not a root process, we only need to receive cell_f */
            snew(comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
        }
    }
    if (bRowRoot)
    {
        /* Receive buffer with room for DD_NLOAD_MAX values per cell */
        snew(comm->load[dim_ind].load,ncell*DD_NLOAD_MAX);
    }
}
#endif
5511 static void make_load_communicators(gmx_domdec_t *dd)
5513 #ifdef GMX_MPI
5514 int dim0,dim1,i,j;
5515 ivec loc;
5517 if (debug)
5518 fprintf(debug,"Making load communicators\n");
5520 snew(dd->comm->load,dd->ndim);
5521 snew(dd->comm->mpi_comm_load,dd->ndim);
5523 clear_ivec(loc);
5524 make_load_communicator(dd,0,loc);
5525 if (dd->ndim > 1) {
5526 dim0 = dd->dim[0];
5527 for(i=0; i<dd->nc[dim0]; i++) {
5528 loc[dim0] = i;
5529 make_load_communicator(dd,1,loc);
5532 if (dd->ndim > 2) {
5533 dim0 = dd->dim[0];
5534 for(i=0; i<dd->nc[dim0]; i++) {
5535 loc[dim0] = i;
5536 dim1 = dd->dim[1];
5537 for(j=0; j<dd->nc[dim1]; j++) {
5538 loc[dim1] = j;
5539 make_load_communicator(dd,2,loc);
5544 if (debug)
5545 fprintf(debug,"Finished making load communicators\n");
5546 #endif
5549 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5551 gmx_bool bZYX;
5552 int d,dim,i,j,m;
5553 ivec tmp,s;
5554 int nzone,nzonep;
5555 ivec dd_zp[DD_MAXIZONE];
5556 gmx_domdec_zones_t *zones;
5557 gmx_domdec_ns_ranges_t *izone;
5559 for(d=0; d<dd->ndim; d++)
5561 dim = dd->dim[d];
5562 copy_ivec(dd->ci,tmp);
5563 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5564 dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5565 copy_ivec(dd->ci,tmp);
5566 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5567 dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5568 if (debug)
5570 fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5571 dd->rank,dim,
5572 dd->neighbor[d][0],
5573 dd->neighbor[d][1]);
5577 if (DDMASTER(dd))
5579 fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5580 dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5582 if (fplog)
5584 fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5585 dd->ndim,
5586 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5587 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5589 switch (dd->ndim)
5591 case 3:
5592 nzone = dd_z3n;
5593 nzonep = dd_zp3n;
5594 for(i=0; i<nzonep; i++)
5596 copy_ivec(dd_zp3[i],dd_zp[i]);
5598 break;
5599 case 2:
5600 nzone = dd_z2n;
5601 nzonep = dd_zp2n;
5602 for(i=0; i<nzonep; i++)
5604 copy_ivec(dd_zp2[i],dd_zp[i]);
5606 break;
5607 case 1:
5608 nzone = dd_z1n;
5609 nzonep = dd_zp1n;
5610 for(i=0; i<nzonep; i++)
5612 copy_ivec(dd_zp1[i],dd_zp[i]);
5614 break;
5615 default:
5616 gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5617 nzone = 0;
5618 nzonep = 0;
5621 zones = &dd->comm->zones;
5623 for(i=0; i<nzone; i++)
5625 m = 0;
5626 clear_ivec(zones->shift[i]);
5627 for(d=0; d<dd->ndim; d++)
5629 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5633 zones->n = nzone;
5634 for(i=0; i<nzone; i++)
5636 for(d=0; d<DIM; d++)
5638 s[d] = dd->ci[d] - zones->shift[i][d];
5639 if (s[d] < 0)
5641 s[d] += dd->nc[d];
5643 else if (s[d] >= dd->nc[d])
5645 s[d] -= dd->nc[d];
5649 zones->nizone = nzonep;
5650 for(i=0; i<zones->nizone; i++)
5652 if (dd_zp[i][0] != i)
5654 gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5656 izone = &zones->izone[i];
5657 izone->j0 = dd_zp[i][1];
5658 izone->j1 = dd_zp[i][2];
5659 for(dim=0; dim<DIM; dim++)
5661 if (dd->nc[dim] == 1)
5663 /* All shifts should be allowed */
5664 izone->shift0[dim] = -1;
5665 izone->shift1[dim] = 1;
5667 else
5670 izone->shift0[d] = 0;
5671 izone->shift1[d] = 0;
5672 for(j=izone->j0; j<izone->j1; j++) {
5673 if (dd->shift[j][d] > dd->shift[i][d])
5674 izone->shift0[d] = -1;
5675 if (dd->shift[j][d] < dd->shift[i][d])
5676 izone->shift1[d] = 1;
5680 int shift_diff;
5682 /* Assume the shift are not more than 1 cell */
5683 izone->shift0[dim] = 1;
5684 izone->shift1[dim] = -1;
5685 for(j=izone->j0; j<izone->j1; j++)
5687 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5688 if (shift_diff < izone->shift0[dim])
5690 izone->shift0[dim] = shift_diff;
5692 if (shift_diff > izone->shift1[dim])
5694 izone->shift1[dim] = shift_diff;
5701 if (dd->comm->eDLB != edlbNO)
5703 snew(dd->comm->root,dd->ndim);
5706 if (dd->comm->bRecordLoad)
5708 make_load_communicators(dd);
5712 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5714 gmx_domdec_t *dd;
5715 gmx_domdec_comm_t *comm;
5716 int i,rank,*buf;
5717 ivec periods;
5718 #ifdef GMX_MPI
5719 MPI_Comm comm_cart;
5720 #endif
5722 dd = cr->dd;
5723 comm = dd->comm;
5725 #ifdef GMX_MPI
5726 if (comm->bCartesianPP)
5728 /* Set up cartesian communication for the particle-particle part */
5729 if (fplog)
5731 fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5732 dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5735 for(i=0; i<DIM; i++)
5737 periods[i] = TRUE;
5739 MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5740 &comm_cart);
5741 /* We overwrite the old communicator with the new cartesian one */
5742 cr->mpi_comm_mygroup = comm_cart;
5745 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5746 MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5748 if (comm->bCartesianPP_PME)
5750 /* Since we want to use the original cartesian setup for sim,
5751 * and not the one after split, we need to make an index.
5753 snew(comm->ddindex2ddnodeid,dd->nnodes);
5754 comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5755 gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5756 /* Get the rank of the DD master,
5757 * above we made sure that the master node is a PP node.
5759 if (MASTER(cr))
5761 rank = dd->rank;
5763 else
5765 rank = 0;
5767 MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5769 else if (comm->bCartesianPP)
5771 if (cr->npmenodes == 0)
5773 /* The PP communicator is also
5774 * the communicator for this simulation
5776 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5778 cr->nodeid = dd->rank;
5780 MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5782 /* We need to make an index to go from the coordinates
5783 * to the nodeid of this simulation.
5785 snew(comm->ddindex2simnodeid,dd->nnodes);
5786 snew(buf,dd->nnodes);
5787 if (cr->duty & DUTY_PP)
5789 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5791 /* Communicate the ddindex to simulation nodeid index */
5792 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5793 cr->mpi_comm_mysim);
5794 sfree(buf);
5796 /* Determine the master coordinates and rank.
5797 * The DD master should be the same node as the master of this sim.
5799 for(i=0; i<dd->nnodes; i++)
5801 if (comm->ddindex2simnodeid[i] == 0)
5803 ddindex2xyz(dd->nc,i,dd->master_ci);
5804 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5807 if (debug)
5809 fprintf(debug,"The master rank is %d\n",dd->masterrank);
5812 else
5814 /* No Cartesian communicators */
5815 /* We use the rank in dd->comm->all as DD index */
5816 ddindex2xyz(dd->nc,dd->rank,dd->ci);
5817 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5818 dd->masterrank = 0;
5819 clear_ivec(dd->master_ci);
5821 #endif
5823 if (fplog)
5825 fprintf(fplog,
5826 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5827 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5829 if (debug)
5831 fprintf(debug,
5832 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5833 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5837 static void receive_ddindex2simnodeid(t_commrec *cr)
5839 gmx_domdec_t *dd;
5841 gmx_domdec_comm_t *comm;
5842 int *buf;
5844 dd = cr->dd;
5845 comm = dd->comm;
5847 #ifdef GMX_MPI
5848 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5850 snew(comm->ddindex2simnodeid,dd->nnodes);
5851 snew(buf,dd->nnodes);
5852 if (cr->duty & DUTY_PP)
5854 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5856 #ifdef GMX_MPI
5857 /* Communicate the ddindex to simulation nodeid index */
5858 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5859 cr->mpi_comm_mysim);
5860 #endif
5861 sfree(buf);
5863 #endif
5866 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5867 int ncg,int natoms)
5869 gmx_domdec_master_t *ma;
5870 int i;
5872 snew(ma,1);
5874 snew(ma->ncg,dd->nnodes);
5875 snew(ma->index,dd->nnodes+1);
5876 snew(ma->cg,ncg);
5877 snew(ma->nat,dd->nnodes);
5878 snew(ma->ibuf,dd->nnodes*2);
5879 snew(ma->cell_x,DIM);
5880 for(i=0; i<DIM; i++)
5882 snew(ma->cell_x[i],dd->nc[i]+1);
5885 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5887 ma->vbuf = NULL;
5889 else
5891 snew(ma->vbuf,natoms);
5894 return ma;
5897 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5898 int reorder)
5900 gmx_domdec_t *dd;
5901 gmx_domdec_comm_t *comm;
5902 int i,rank;
5903 gmx_bool bDiv[DIM];
5904 ivec periods;
5905 #ifdef GMX_MPI
5906 MPI_Comm comm_cart;
5907 #endif
5909 dd = cr->dd;
5910 comm = dd->comm;
5912 if (comm->bCartesianPP)
5914 for(i=1; i<DIM; i++)
5916 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5918 if (bDiv[YY] || bDiv[ZZ])
5920 comm->bCartesianPP_PME = TRUE;
5921 /* If we have 2D PME decomposition, which is always in x+y,
5922 * we stack the PME only nodes in z.
5923 * Otherwise we choose the direction that provides the thinnest slab
5924 * of PME only nodes as this will have the least effect
5925 * on the PP communication.
5926 * But for the PME communication the opposite might be better.
5928 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5929 !bDiv[YY] ||
5930 dd->nc[YY] > dd->nc[ZZ]))
5932 comm->cartpmedim = ZZ;
5934 else
5936 comm->cartpmedim = YY;
5938 comm->ntot[comm->cartpmedim]
5939 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5941 else if (fplog)
5943 fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5944 fprintf(fplog,
5945 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5949 #ifdef GMX_MPI
5950 if (comm->bCartesianPP_PME)
5952 if (fplog)
5954 fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5957 for(i=0; i<DIM; i++)
5959 periods[i] = TRUE;
5961 MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5962 &comm_cart);
5964 MPI_Comm_rank(comm_cart,&rank);
5965 if (MASTERNODE(cr) && rank != 0)
5967 gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5970 /* With this assigment we loose the link to the original communicator
5971 * which will usually be MPI_COMM_WORLD, unless have multisim.
5973 cr->mpi_comm_mysim = comm_cart;
5974 cr->sim_nodeid = rank;
5976 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5978 if (fplog)
5980 fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5981 cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5984 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5986 cr->duty = DUTY_PP;
5988 if (cr->npmenodes == 0 ||
5989 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5991 cr->duty = DUTY_PME;
5994 /* Split the sim communicator into PP and PME only nodes */
5995 MPI_Comm_split(cr->mpi_comm_mysim,
5996 cr->duty,
5997 dd_index(comm->ntot,dd->ci),
5998 &cr->mpi_comm_mygroup);
6000 else
6002 switch (dd_node_order)
6004 case ddnoPP_PME:
6005 if (fplog)
6007 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
6009 break;
6010 case ddnoINTERLEAVE:
6011 /* Interleave the PP-only and PME-only nodes,
6012 * as on clusters with dual-core machines this will double
6013 * the communication bandwidth of the PME processes
6014 * and thus speed up the PP <-> PME and inter PME communication.
6016 if (fplog)
6018 fprintf(fplog,"Interleaving PP and PME nodes\n");
6020 comm->pmenodes = dd_pmenodes(cr);
6021 break;
6022 case ddnoCARTESIAN:
6023 break;
6024 default:
6025 gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6028 if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6030 cr->duty = DUTY_PME;
6032 else
6034 cr->duty = DUTY_PP;
6037 /* Split the sim communicator into PP and PME only nodes */
6038 MPI_Comm_split(cr->mpi_comm_mysim,
6039 cr->duty,
6040 cr->nodeid,
6041 &cr->mpi_comm_mygroup);
6042 MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6044 #endif
6046 if (fplog)
6048 fprintf(fplog,"This is a %s only node\n\n",
6049 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6053 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6055 gmx_domdec_t *dd;
6056 gmx_domdec_comm_t *comm;
6057 int CartReorder;
6059 dd = cr->dd;
6060 comm = dd->comm;
6062 copy_ivec(dd->nc,comm->ntot);
6064 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6065 comm->bCartesianPP_PME = FALSE;
6067 /* Reorder the nodes by default. This might change the MPI ranks.
6068 * Real reordering is only supported on very few architectures,
6069 * Blue Gene is one of them.
6071 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6073 if (cr->npmenodes > 0)
6075 /* Split the communicator into a PP and PME part */
6076 split_communicator(fplog,cr,dd_node_order,CartReorder);
6077 if (comm->bCartesianPP_PME)
6079 /* We (possibly) reordered the nodes in split_communicator,
6080 * so it is no longer required in make_pp_communicator.
6082 CartReorder = FALSE;
6085 else
6087 /* All nodes do PP and PME */
6088 #ifdef GMX_MPI
6089 /* We do not require separate communicators */
6090 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6091 #endif
6094 if (cr->duty & DUTY_PP)
6096 /* Copy or make a new PP communicator */
6097 make_pp_communicator(fplog,cr,CartReorder);
6099 else
6101 receive_ddindex2simnodeid(cr);
6104 if (!(cr->duty & DUTY_PME))
6106 /* Set up the commnuication to our PME node */
6107 dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6108 dd->pme_receive_vir_ener = receive_vir_ener(cr);
6109 if (debug)
6111 fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6112 dd->pme_nodeid,dd->pme_receive_vir_ener);
6115 else
6117 dd->pme_nodeid = -1;
6120 if (DDMASTER(dd))
6122 dd->ma = init_gmx_domdec_master_t(dd,
6123 comm->cgs_gl.nr,
6124 comm->cgs_gl.index[comm->cgs_gl.nr]);
6128 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6130 real *slb_frac,tot;
6131 int i,n;
6132 double dbl;
6134 slb_frac = NULL;
6135 if (nc > 1 && size_string != NULL)
6137 if (fplog)
6139 fprintf(fplog,"Using static load balancing for the %s direction\n",
6140 dir);
6142 snew(slb_frac,nc);
6143 tot = 0;
6144 for (i=0; i<nc; i++)
6146 dbl = 0;
6147 sscanf(size_string,"%lf%n",&dbl,&n);
6148 if (dbl == 0)
6150 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6152 slb_frac[i] = dbl;
6153 size_string += n;
6154 tot += slb_frac[i];
6156 /* Normalize */
6157 if (fplog)
6159 fprintf(fplog,"Relative cell sizes:");
6161 for (i=0; i<nc; i++)
6163 slb_frac[i] /= tot;
6164 if (fplog)
6166 fprintf(fplog," %5.3f",slb_frac[i]);
6169 if (fplog)
6171 fprintf(fplog,"\n");
6175 return slb_frac;
6178 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6180 int n,nmol,ftype;
6181 gmx_mtop_ilistloop_t iloop;
6182 t_ilist *il;
6184 n = 0;
6185 iloop = gmx_mtop_ilistloop_init(mtop);
6186 while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6188 for(ftype=0; ftype<F_NRE; ftype++)
6190 if ((interaction_function[ftype].flags & IF_BOND) &&
6191 NRAL(ftype) > 2)
6193 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6198 return n;
/* Read an integer setting from an environment variable.
 *
 * fplog   - log file, may be NULL; when set, a found variable is reported
 * env_var - name of the environment variable
 * def     - value returned when the variable is not set
 *
 * Returns def when the variable is unset, the parsed integer when it is
 * set to one, and 1 when it is set but does not parse as an integer.
 */
static int dd_nst_env(FILE *fplog,const char *env_var,int def)
{
    char *setting = getenv(env_var);
    int   value   = def;

    if (setting == NULL)
    {
        return value;
    }

    if (sscanf(setting,"%d",&value) <= 0)
    {
        /* Set but not a number: treat the variable as a plain switch */
        value = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
                env_var,setting,value);
    }

    return value;
}
6224 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6226 if (MASTER(cr))
6228 fprintf(stderr,"\n%s\n",warn_string);
6230 if (fplog)
6232 fprintf(fplog,"\n%s\n",warn_string);
6236 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6237 t_inputrec *ir,FILE *fplog)
6239 if (ir->ePBC == epbcSCREW &&
6240 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6242 gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6245 if (ir->ns_type == ensSIMPLE)
6247 gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6250 if (ir->nstlist == 0)
6252 gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6255 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6257 dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6261 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6263 int di,d;
6264 real r;
6266 r = ddbox->box_size[XX];
6267 for(di=0; di<dd->ndim; di++)
6269 d = dd->dim[di];
6270 /* Check using the initial average cell size */
6271 r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6274 return r;
6277 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6278 const char *dlb_opt,gmx_bool bRecordLoad,
6279 unsigned long Flags,t_inputrec *ir)
6281 gmx_domdec_t *dd;
6282 int eDLB=-1;
6283 char buf[STRLEN];
6285 switch (dlb_opt[0])
6287 case 'a': eDLB = edlbAUTO; break;
6288 case 'n': eDLB = edlbNO; break;
6289 case 'y': eDLB = edlbYES; break;
6290 default: gmx_incons("Unknown dlb_opt");
6293 if (Flags & MD_RERUN)
6295 return edlbNO;
6298 if (!EI_DYNAMICS(ir->eI))
6300 if (eDLB == edlbYES)
6302 sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6303 dd_warning(cr,fplog,buf);
6306 return edlbNO;
6309 if (!bRecordLoad)
6311 dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6313 return edlbNO;
6316 if (Flags & MD_REPRODUCIBLE)
6318 switch (eDLB)
6320 case edlbNO:
6321 break;
6322 case edlbAUTO:
6323 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6324 eDLB = edlbNO;
6325 break;
6326 case edlbYES:
6327 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6328 break;
6329 default:
6330 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6331 break;
6335 return eDLB;
6338 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6340 int dim;
6342 dd->ndim = 0;
6343 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6345 /* Decomposition order z,y,x */
6346 if (fplog)
6348 fprintf(fplog,"Using domain decomposition order z, y, x\n");
6350 for(dim=DIM-1; dim>=0; dim--)
6352 if (dd->nc[dim] > 1)
6354 dd->dim[dd->ndim++] = dim;
6358 else
6360 /* Decomposition order x,y,z */
6361 for(dim=0; dim<DIM; dim++)
6363 if (dd->nc[dim] > 1)
6365 dd->dim[dd->ndim++] = dim;
6371 static gmx_domdec_comm_t *init_dd_comm()
6373 gmx_domdec_comm_t *comm;
6374 int i;
6376 snew(comm,1);
6377 snew(comm->cggl_flag,DIM*2);
6378 snew(comm->cgcm_state,DIM*2);
6379 for(i=0; i<DIM*2; i++)
6381 comm->cggl_flag_nalloc[i] = 0;
6382 comm->cgcm_state_nalloc[i] = 0;
6385 comm->nalloc_int = 0;
6386 comm->buf_int = NULL;
6388 vec_rvec_init(&comm->vbuf);
6390 comm->n_load_have = 0;
6391 comm->n_load_collect = 0;
6393 for(i=0; i<ddnatNR-ddnatZONE; i++)
6395 comm->sum_nat[i] = 0;
6397 comm->ndecomp = 0;
6398 comm->nload = 0;
6399 comm->load_step = 0;
6400 comm->load_sum = 0;
6401 comm->load_max = 0;
6402 clear_ivec(comm->load_lim);
6403 comm->load_mdf = 0;
6404 comm->load_pme = 0;
6406 return comm;
6409 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6410 unsigned long Flags,
6411 ivec nc,
6412 real comm_distance_min,real rconstr,
6413 const char *dlb_opt,real dlb_scale,
6414 const char *sizex,const char *sizey,const char *sizez,
6415 gmx_mtop_t *mtop,t_inputrec *ir,
6416 matrix box,rvec *x,
6417 gmx_ddbox_t *ddbox,
6418 int *npme_x,int *npme_y)
6420 gmx_domdec_t *dd;
6421 gmx_domdec_comm_t *comm;
6422 int recload;
6423 int d,i,j;
6424 real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6425 gmx_bool bC;
6426 char buf[STRLEN];
6428 if (fplog)
6430 fprintf(fplog,
6431 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6434 snew(dd,1);
6436 dd->comm = init_dd_comm();
6437 comm = dd->comm;
6438 snew(comm->cggl_flag,DIM*2);
6439 snew(comm->cgcm_state,DIM*2);
6441 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6442 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6444 dd->bSendRecv2 = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6445 comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6446 comm->eFlop = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6447 recload = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6448 comm->nstSortCG = dd_nst_env(fplog,"GMX_DD_SORT",1);
6449 comm->nstDDDump = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6450 comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6451 comm->DD_debug = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6453 dd->pme_recv_f_alloc = 0;
6454 dd->pme_recv_f_buf = NULL;
6456 if (dd->bSendRecv2 && fplog)
6458 fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6460 if (comm->eFlop)
6462 if (fplog)
6464 fprintf(fplog,"Will load balance based on FLOP count\n");
6466 if (comm->eFlop > 1)
6468 srand(1+cr->nodeid);
6470 comm->bRecordLoad = TRUE;
6472 else
6474 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6478 comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6480 comm->bDynLoadBal = (comm->eDLB == edlbYES);
6481 if (fplog)
6483 fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6485 dd->bGridJump = comm->bDynLoadBal;
6487 if (comm->nstSortCG)
6489 if (fplog)
6491 if (comm->nstSortCG == 1)
6493 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6495 else
6497 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6498 comm->nstSortCG);
6501 snew(comm->sort,1);
6503 else
6505 if (fplog)
6507 fprintf(fplog,"Will not sort the charge groups\n");
6511 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6513 comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6514 if (comm->bInterCGBondeds)
6516 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6518 else
6520 comm->bInterCGMultiBody = FALSE;
6523 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6524 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6526 if (ir->rlistlong == 0)
6528 /* Set the cut-off to some very large value,
6529 * so we don't need if statements everywhere in the code.
6530 * We use sqrt, since the cut-off is squared in some places.
6532 comm->cutoff = GMX_CUTOFF_INF;
6534 else
6536 comm->cutoff = ir->rlistlong;
6538 comm->cutoff_mbody = 0;
6540 comm->cellsize_limit = 0;
6541 comm->bBondComm = FALSE;
6543 if (comm->bInterCGBondeds)
6545 if (comm_distance_min > 0)
6547 comm->cutoff_mbody = comm_distance_min;
6548 if (Flags & MD_DDBONDCOMM)
6550 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6552 else
6554 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6556 r_bonded_limit = comm->cutoff_mbody;
6558 else if (ir->bPeriodicMols)
6560 /* Can not easily determine the required cut-off */
6561 dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6562 comm->cutoff_mbody = comm->cutoff/2;
6563 r_bonded_limit = comm->cutoff_mbody;
6565 else
6567 if (MASTER(cr))
6569 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6570 Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6572 gmx_bcast(sizeof(r_2b),&r_2b,cr);
6573 gmx_bcast(sizeof(r_mb),&r_mb,cr);
6575 /* We use an initial margin of 10% for the minimum cell size,
6576 * except when we are just below the non-bonded cut-off.
6578 if (Flags & MD_DDBONDCOMM)
6580 if (max(r_2b,r_mb) > comm->cutoff)
6582 r_bonded = max(r_2b,r_mb);
6583 r_bonded_limit = 1.1*r_bonded;
6584 comm->bBondComm = TRUE;
6586 else
6588 r_bonded = r_mb;
6589 r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6591 /* We determine cutoff_mbody later */
6593 else
6595 /* No special bonded communication,
6596 * simply increase the DD cut-off.
6598 r_bonded_limit = 1.1*max(r_2b,r_mb);
6599 comm->cutoff_mbody = r_bonded_limit;
6600 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6603 comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6604 if (fplog)
6606 fprintf(fplog,
6607 "Minimum cell size due to bonded interactions: %.3f nm\n",
6608 comm->cellsize_limit);
6612 if (dd->bInterCGcons && rconstr <= 0)
6614 /* There is a cell size limit due to the constraints (P-LINCS) */
6615 rconstr = constr_r_max(fplog,mtop,ir);
6616 if (fplog)
6618 fprintf(fplog,
6619 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6620 rconstr);
6621 if (rconstr > comm->cellsize_limit)
6623 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6627 else if (rconstr > 0 && fplog)
6629 /* Here we do not check for dd->bInterCGcons,
6630 * because one can also set a cell size limit for virtual sites only
6631 * and at this point we don't know yet if there are intercg v-sites.
6633 fprintf(fplog,
6634 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6635 rconstr);
6637 comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6639 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6641 if (nc[XX] > 0)
6643 copy_ivec(nc,dd->nc);
6644 set_dd_dim(fplog,dd);
6645 set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6647 if (cr->npmenodes == -1)
6649 cr->npmenodes = 0;
6651 acs = average_cellsize_min(dd,ddbox);
6652 if (acs < comm->cellsize_limit)
6654 if (fplog)
6656 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6658 gmx_fatal_collective(FARGS,cr,NULL,
6659 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6660 acs,comm->cellsize_limit);
6663 else
6665 set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6667 /* We need to choose the optimal DD grid and possibly PME nodes */
6668 limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6669 comm->eDLB!=edlbNO,dlb_scale,
6670 comm->cellsize_limit,comm->cutoff,
6671 comm->bInterCGBondeds,comm->bInterCGMultiBody);
6673 if (dd->nc[XX] == 0)
6675 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6676 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6677 !bC ? "-rdd" : "-rcon",
6678 comm->eDLB!=edlbNO ? " or -dds" : "",
6679 bC ? " or your LINCS settings" : "");
6681 gmx_fatal_collective(FARGS,cr,NULL,
6682 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6683 "%s\n"
6684 "Look in the log file for details on the domain decomposition",
6685 cr->nnodes-cr->npmenodes,limit,buf);
6687 set_dd_dim(fplog,dd);
6690 if (fplog)
6692 fprintf(fplog,
6693 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6694 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6697 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6698 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6700 gmx_fatal_collective(FARGS,cr,NULL,
6701 "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6702 dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6704 if (cr->npmenodes > dd->nnodes)
6706 gmx_fatal_collective(FARGS,cr,NULL,
6707 "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6709 if (cr->npmenodes > 0)
6711 comm->npmenodes = cr->npmenodes;
6713 else
6715 comm->npmenodes = dd->nnodes;
6718 if (EEL_PME(ir->coulombtype))
6720 /* The following choices should match those
6721 * in comm_cost_est in domdec_setup.c.
6722 * Note that here the checks have to take into account
6723 * that the decomposition might occur in a different order than xyz
6724 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6725 * in which case they will not match those in comm_cost_est,
6726 * but since that is mainly for testing purposes that's fine.
6728 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6729 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6730 getenv("GMX_PMEONEDD") == NULL)
6732 comm->npmedecompdim = 2;
6733 comm->npmenodes_x = dd->nc[XX];
6734 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6736 else
6738 /* In case nc is 1 in both x and y we could still choose to
6739 * decompose pme in y instead of x, but we use x for simplicity.
6741 comm->npmedecompdim = 1;
6742 if (dd->dim[0] == YY)
6744 comm->npmenodes_x = 1;
6745 comm->npmenodes_y = comm->npmenodes;
6747 else
6749 comm->npmenodes_x = comm->npmenodes;
6750 comm->npmenodes_y = 1;
6753 if (fplog)
6755 fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6756 comm->npmenodes_x,comm->npmenodes_y,1);
6759 else
6761 comm->npmedecompdim = 0;
6762 comm->npmenodes_x = 0;
6763 comm->npmenodes_y = 0;
6766 /* Technically we don't need both of these,
6767 * but it simplifies code not having to recalculate it.
6769 *npme_x = comm->npmenodes_x;
6770 *npme_y = comm->npmenodes_y;
6772 snew(comm->slb_frac,DIM);
6773 if (comm->eDLB == edlbNO)
6775 comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6776 comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6777 comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6780 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6782 if (comm->bBondComm || comm->eDLB != edlbNO)
6784 /* Set the bonded communication distance to halfway
6785 * the minimum and the maximum,
6786 * since the extra communication cost is nearly zero.
6788 acs = average_cellsize_min(dd,ddbox);
6789 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6790 if (comm->eDLB != edlbNO)
6792 /* Check if this does not limit the scaling */
6793 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6795 if (!comm->bBondComm)
6797 /* Without bBondComm do not go beyond the n.b. cut-off */
6798 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6799 if (comm->cellsize_limit >= comm->cutoff)
6801 /* We don't loose a lot of efficieny
6802 * when increasing it to the n.b. cut-off.
6803 * It can even be slightly faster, because we need
6804 * less checks for the communication setup.
6806 comm->cutoff_mbody = comm->cutoff;
6809 /* Check if we did not end up below our original limit */
6810 comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6812 if (comm->cutoff_mbody > comm->cellsize_limit)
6814 comm->cellsize_limit = comm->cutoff_mbody;
6817 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6820 if (debug)
6822 fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6823 "cellsize limit %f\n",
6824 comm->bBondComm,comm->cellsize_limit);
6827 if (MASTER(cr))
6829 check_dd_restrictions(cr,dd,ir,fplog);
6832 comm->partition_step = INT_MIN;
6833 dd->ddp_count = 0;
6835 clear_dd_cycle_counts(dd);
6837 return dd;
6840 static void set_dlb_limits(gmx_domdec_t *dd)
6843 int d;
6845 for(d=0; d<dd->ndim; d++)
6847 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6848 dd->comm->cellsize_min[dd->dim[d]] =
6849 dd->comm->cellsize_min_dlb[dd->dim[d]];
6854 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6856 gmx_domdec_t *dd;
6857 gmx_domdec_comm_t *comm;
6858 real cellsize_min;
6859 int d,nc,i;
6860 char buf[STRLEN];
6862 dd = cr->dd;
6863 comm = dd->comm;
6865 if (fplog)
6867 fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6870 cellsize_min = comm->cellsize_min[dd->dim[0]];
6871 for(d=1; d<dd->ndim; d++)
6873 cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6876 if (cellsize_min < comm->cellsize_limit*1.05)
6878 dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6880 /* Change DLB from "auto" to "no". */
6881 comm->eDLB = edlbNO;
6883 return;
6886 dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6887 comm->bDynLoadBal = TRUE;
6888 dd->bGridJump = TRUE;
6890 set_dlb_limits(dd);
6892 /* We can set the required cell size info here,
6893 * so we do not need to communicate this.
6894 * The grid is completely uniform.
6896 for(d=0; d<dd->ndim; d++)
6898 if (comm->root[d])
6900 comm->load[d].sum_m = comm->load[d].sum;
6902 nc = dd->nc[dd->dim[d]];
6903 for(i=0; i<nc; i++)
6905 comm->root[d]->cell_f[i] = i/(real)nc;
6906 if (d > 0)
6908 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6909 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6912 comm->root[d]->cell_f[nc] = 1.0;
6917 static char *init_bLocalCG(gmx_mtop_t *mtop)
6919 int ncg,cg;
6920 char *bLocalCG;
6922 ncg = ncg_mtop(mtop);
6923 snew(bLocalCG,ncg);
6924 for(cg=0; cg<ncg; cg++)
6926 bLocalCG[cg] = FALSE;
6929 return bLocalCG;
6932 void dd_init_bondeds(FILE *fplog,
6933 gmx_domdec_t *dd,gmx_mtop_t *mtop,
6934 gmx_vsite_t *vsite,gmx_constr_t constr,
6935 t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6937 gmx_domdec_comm_t *comm;
6938 gmx_bool bBondComm;
6939 int d;
6941 dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6943 comm = dd->comm;
6945 if (comm->bBondComm)
6947 /* Communicate atoms beyond the cut-off for bonded interactions */
6948 comm = dd->comm;
6950 comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6952 comm->bLocalCG = init_bLocalCG(mtop);
6954 else
6956 /* Only communicate atoms based on cut-off */
6957 comm->cglink = NULL;
6958 comm->bLocalCG = NULL;
6962 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6963 t_inputrec *ir,
6964 gmx_bool bDynLoadBal,real dlb_scale,
6965 gmx_ddbox_t *ddbox)
6967 gmx_domdec_comm_t *comm;
6968 int d;
6969 ivec np;
6970 real limit,shrink;
6971 char buf[64];
6973 if (fplog == NULL)
6975 return;
6978 comm = dd->comm;
6980 if (bDynLoadBal)
6982 fprintf(fplog,"The maximum number of communication pulses is:");
6983 for(d=0; d<dd->ndim; d++)
6985 fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6987 fprintf(fplog,"\n");
6988 fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6989 fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6990 fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6991 for(d=0; d<DIM; d++)
6993 if (dd->nc[d] > 1)
6995 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6997 shrink = 0;
6999 else
7001 shrink =
7002 comm->cellsize_min_dlb[d]/
7003 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7005 fprintf(fplog," %c %.2f",dim2char(d),shrink);
7008 fprintf(fplog,"\n");
7010 else
7012 set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
7013 fprintf(fplog,"The initial number of communication pulses is:");
7014 for(d=0; d<dd->ndim; d++)
7016 fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7018 fprintf(fplog,"\n");
7019 fprintf(fplog,"The initial domain decomposition cell size is:");
7020 for(d=0; d<DIM; d++) {
7021 if (dd->nc[d] > 1)
7023 fprintf(fplog," %c %.2f nm",
7024 dim2char(d),dd->comm->cellsize_min[d]);
7027 fprintf(fplog,"\n\n");
7030 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7032 fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7033 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7034 "non-bonded interactions","",comm->cutoff);
7036 if (bDynLoadBal)
7038 limit = dd->comm->cellsize_limit;
7040 else
7042 if (dynamic_dd_box(ddbox,ir))
7044 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7046 limit = dd->comm->cellsize_min[XX];
7047 for(d=1; d<DIM; d++)
7049 limit = min(limit,dd->comm->cellsize_min[d]);
7053 if (comm->bInterCGBondeds)
7055 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7056 "two-body bonded interactions","(-rdd)",
7057 max(comm->cutoff,comm->cutoff_mbody));
7058 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7059 "multi-body bonded interactions","(-rdd)",
7060 (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7062 if (dd->vsite_comm)
7064 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7065 "virtual site constructions","(-rcon)",limit);
7067 if (dd->constraint_comm)
7069 sprintf(buf,"atoms separated by up to %d constraints",
7070 1+ir->nProjOrder);
7071 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7072 buf,"(-rcon)",limit);
7074 fprintf(fplog,"\n");
7077 fflush(fplog);
7080 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7081 real dlb_scale,
7082 const t_inputrec *ir,
7083 const gmx_ddbox_t *ddbox)
7085 gmx_domdec_comm_t *comm;
7086 int d,dim,npulse,npulse_d_max,npulse_d;
7087 gmx_bool bNoCutOff;
7089 comm = dd->comm;
7091 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7093 /* Determine the maximum number of comm. pulses in one dimension */
7095 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7097 /* Determine the maximum required number of grid pulses */
7098 if (comm->cellsize_limit >= comm->cutoff)
7100 /* Only a single pulse is required */
7101 npulse = 1;
7103 else if (!bNoCutOff && comm->cellsize_limit > 0)
7105 /* We round down slightly here to avoid overhead due to the latency
7106 * of extra communication calls when the cut-off
7107 * would be only slightly longer than the cell size.
7108 * Later cellsize_limit is redetermined,
7109 * so we can not miss interactions due to this rounding.
7111 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7113 else
7115 /* There is no cell size limit */
7116 npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7119 if (!bNoCutOff && npulse > 1)
7121 /* See if we can do with less pulses, based on dlb_scale */
7122 npulse_d_max = 0;
7123 for(d=0; d<dd->ndim; d++)
7125 dim = dd->dim[d];
7126 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7127 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7128 npulse_d_max = max(npulse_d_max,npulse_d);
7130 npulse = min(npulse,npulse_d_max);
7133 /* This env var can override npulse */
7134 d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7135 if (d > 0)
7137 npulse = d;
7140 comm->maxpulse = 1;
7141 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7142 for(d=0; d<dd->ndim; d++)
7144 comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7145 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7146 snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7147 comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7148 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7150 comm->bVacDLBNoLimit = FALSE;
7154 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7155 if (!comm->bVacDLBNoLimit)
7157 comm->cellsize_limit = max(comm->cellsize_limit,
7158 comm->cutoff/comm->maxpulse);
7160 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7161 /* Set the minimum cell size for each DD dimension */
7162 for(d=0; d<dd->ndim; d++)
7164 if (comm->bVacDLBNoLimit ||
7165 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7167 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7169 else
7171 comm->cellsize_min_dlb[dd->dim[d]] =
7172 comm->cutoff/comm->cd[d].np_dlb;
7175 if (comm->cutoff_mbody <= 0)
7177 comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7179 if (comm->bDynLoadBal)
7181 set_dlb_limits(dd);
7185 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7187 /* If each molecule is a single charge group
7188 * or we use domain decomposition for each periodic dimension,
7189 * we do not need to take pbc into account for the bonded interactions.
7191 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7192 !(dd->nc[XX]>1 &&
7193 dd->nc[YY]>1 &&
7194 (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7197 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7198 t_inputrec *ir,t_forcerec *fr,
7199 gmx_ddbox_t *ddbox)
7201 gmx_domdec_comm_t *comm;
7202 int natoms_tot;
7203 real vol_frac;
7205 comm = dd->comm;
7207 /* Initialize the thread data.
7208 * This can not be done in init_domain_decomposition,
7209 * as the numbers of threads is determined later.
7211 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7212 if (comm->nth > 1)
7214 snew(comm->dth,comm->nth);
7217 if (EEL_PME(ir->coulombtype))
7219 init_ddpme(dd,&comm->ddpme[0],0);
7220 if (comm->npmedecompdim >= 2)
7222 init_ddpme(dd,&comm->ddpme[1],1);
7225 else
7227 comm->npmenodes = 0;
7228 if (dd->pme_nodeid >= 0)
7230 gmx_fatal_collective(FARGS,NULL,dd,
7231 "Can not have separate PME nodes without PME electrostatics");
7235 if (debug)
7237 fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7239 if (comm->eDLB != edlbNO)
7241 set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7244 print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7245 if (comm->eDLB == edlbAUTO)
7247 if (fplog)
7249 fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7251 print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7254 if (ir->ePBC == epbcNONE)
7256 vol_frac = 1 - 1/(double)dd->nnodes;
7258 else
7260 vol_frac =
7261 (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7263 if (debug)
7265 fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7267 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7269 dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7272 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7273 real cutoff_req)
7275 gmx_domdec_t *dd;
7276 gmx_ddbox_t ddbox;
7277 int d,dim,np;
7278 real inv_cell_size;
7279 int LocallyLimited;
7281 dd = cr->dd;
7283 set_ddbox(dd,FALSE,cr,ir,state->box,
7284 TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7286 LocallyLimited = 0;
7288 for(d=0; d<dd->ndim; d++)
7290 dim = dd->dim[d];
7292 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7293 if (dynamic_dd_box(&ddbox,ir))
7295 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7298 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7300 if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7301 dd->comm->cd[d].np_dlb > 0)
7303 if (np > dd->comm->cd[d].np_dlb)
7305 return FALSE;
7308 /* If a current local cell size is smaller than the requested
7309 * cut-off, we could still fix it, but this gets very complicated.
7310 * Without fixing here, we might actually need more checks.
7312 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7314 LocallyLimited = 1;
7319 if (dd->comm->eDLB != edlbNO)
7321 /* If DLB is not active yet, we don't need to check the grid jumps.
7322 * Actually we shouldn't, because then the grid jump data is not set.
7324 if (dd->comm->bDynLoadBal &&
7325 check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7327 LocallyLimited = 1;
7330 gmx_sumi(1,&LocallyLimited,cr);
7332 if (LocallyLimited > 0)
7334 return FALSE;
7338 dd->comm->cutoff = cutoff_req;
7340 return TRUE;
7343 static void merge_cg_buffers(int ncell,
7344 gmx_domdec_comm_dim_t *cd, int pulse,
7345 int *ncg_cell,
7346 int *index_gl, int *recv_i,
7347 rvec *cg_cm, rvec *recv_vr,
7348 int *cgindex,
7349 cginfo_mb_t *cginfo_mb,int *cginfo)
7351 gmx_domdec_ind_t *ind,*ind_p;
7352 int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7353 int shift,shift_at;
7355 ind = &cd->ind[pulse];
7357 /* First correct the already stored data */
7358 shift = ind->nrecv[ncell];
7359 for(cell=ncell-1; cell>=0; cell--)
7361 shift -= ind->nrecv[cell];
7362 if (shift > 0)
7364 /* Move the cg's present from previous grid pulses */
7365 cg0 = ncg_cell[ncell+cell];
7366 cg1 = ncg_cell[ncell+cell+1];
7367 cgindex[cg1+shift] = cgindex[cg1];
7368 for(cg=cg1-1; cg>=cg0; cg--)
7370 index_gl[cg+shift] = index_gl[cg];
7371 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7372 cgindex[cg+shift] = cgindex[cg];
7373 cginfo[cg+shift] = cginfo[cg];
7375 /* Correct the already stored send indices for the shift */
7376 for(p=1; p<=pulse; p++)
7378 ind_p = &cd->ind[p];
7379 cg0 = 0;
7380 for(c=0; c<cell; c++)
7382 cg0 += ind_p->nsend[c];
7384 cg1 = cg0 + ind_p->nsend[cell];
7385 for(cg=cg0; cg<cg1; cg++)
7387 ind_p->index[cg] += shift;
7393 /* Merge in the communicated buffers */
7394 shift = 0;
7395 shift_at = 0;
7396 cg0 = 0;
7397 for(cell=0; cell<ncell; cell++)
7399 cg1 = ncg_cell[ncell+cell+1] + shift;
7400 if (shift_at > 0)
7402 /* Correct the old cg indices */
7403 for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7405 cgindex[cg+1] += shift_at;
7408 for(cg=0; cg<ind->nrecv[cell]; cg++)
7410 /* Copy this charge group from the buffer */
7411 index_gl[cg1] = recv_i[cg0];
7412 copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7413 /* Add it to the cgindex */
7414 cg_gl = index_gl[cg1];
7415 cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7416 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7417 cgindex[cg1+1] = cgindex[cg1] + nat;
7418 cg0++;
7419 cg1++;
7420 shift_at += nat;
7422 shift += ind->nrecv[cell];
7423 ncg_cell[ncell+cell+1] = cg1;
7427 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7428 int nzone,int cg0,const int *cgindex)
7430 int cg,zone,p;
7432 /* Store the atom block boundaries for easy copying of communication buffers
7434 cg = cg0;
7435 for(zone=0; zone<nzone; zone++)
7437 for(p=0; p<cd->np; p++) {
7438 cd->ind[p].cell2at0[zone] = cgindex[cg];
7439 cg += cd->ind[p].nrecv[zone];
7440 cd->ind[p].cell2at1[zone] = cgindex[cg];
7445 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7447 int i;
7448 gmx_bool bMiss;
7450 bMiss = FALSE;
7451 for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7453 if (!bLocalCG[link->a[i]])
7455 bMiss = TRUE;
7459 return bMiss;
7462 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
/* The first index of c and the index of bc is the DD dimension index,
 * the second index of c and the index of cr1 is the zone.
 */
7463 typedef struct {
7464 real c[DIM][4]; /* the corners for the non-bonded communication */
7465 real cr0; /* corner for rounding */
7466 real cr1[4]; /* corners for rounding */
7467 real bc[DIM]; /* corners for bonded communication */
7468 real bcr1; /* corner for rounding for bonded communication */
7469 } dd_corners_t;
7471 /* Determine the corners of the domain(s) we are communicating with */
7472 static void
7473 set_dd_corners(const gmx_domdec_t *dd,
7474 int dim0, int dim1, int dim2,
7475 gmx_bool bDistMB,
7476 dd_corners_t *c)
7478 const gmx_domdec_comm_t *comm;
7479 const gmx_domdec_zones_t *zones;
7480 int i,j;
7482 comm = dd->comm;
7484 zones = &comm->zones;
7486 /* Keep the compiler happy */
7487 c->cr0 = 0;
7488 c->bcr1 = 0;
7490 /* The first dimension is equal for all cells */
7491 c->c[0][0] = comm->cell_x0[dim0];
7492 if (bDistMB)
7494 c->bc[0] = c->c[0][0];
7496 if (dd->ndim >= 2)
7498 dim1 = dd->dim[1];
7499 /* This cell row is only seen from the first row */
7500 c->c[1][0] = comm->cell_x0[dim1];
7501 /* All rows can see this row */
7502 c->c[1][1] = comm->cell_x0[dim1];
7503 if (dd->bGridJump)
7505 c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7506 if (bDistMB)
7508 /* For the multi-body distance we need the maximum */
7509 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7512 /* Set the upper-right corner for rounding */
7513 c->cr0 = comm->cell_x1[dim0];
7515 if (dd->ndim >= 3)
7517 dim2 = dd->dim[2];
7518 for(j=0; j<4; j++)
7520 c->c[2][j] = comm->cell_x0[dim2];
7522 if (dd->bGridJump)
7524 /* Use the maximum of the i-cells that see a j-cell */
7525 for(i=0; i<zones->nizone; i++)
7527 for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7529 if (j >= 4)
7531 c->c[2][j-4] =
7532 max(c->c[2][j-4],
7533 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7537 if (bDistMB)
7539 /* For the multi-body distance we need the maximum */
7540 c->bc[2] = comm->cell_x0[dim2];
7541 for(i=0; i<2; i++)
7543 for(j=0; j<2; j++)
7545 c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7551 /* Set the upper-right corner for rounding */
7552 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7553 * Only cell (0,0,0) can see cell 7 (1,1,1)
7555 c->cr1[0] = comm->cell_x1[dim1];
7556 c->cr1[3] = comm->cell_x1[dim1];
7557 if (dd->bGridJump)
7559 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7560 if (bDistMB)
7562 /* For the multi-body distance we need the maximum */
7563 c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7570 /* Determine which cg's we need to send in this pulse from this zone */
/* For each charge group cg in the range cg0 <= cg < cg1 a squared
 * distance r2 to the corners in c is computed, and also rb2 when both
 * bDistMB and bDistBonded are set. A separate code path is used
 * when tric_dist[dim_ind] is non-zero.
 * A charge group is selected when r2 < r_comm2, or when bDistBonded
 * is set and the bonded distance criterion (rb2 with bDistMB, r2 with
 * bDist2B) is below r_bcomm2 and, with bBondComm, the charge group
 * has the GET_CGINFO_BOND_INTER flag and missing_link() returns TRUE.
 * For a selected charge group the local index is stored in ind->index,
 * the global index in *ibuf and the position in vbuf; the position is
 * shifted by box[dim] when dd->ci[dim] == 0.
 * *nsend_ptr and *nat_ptr are read on entry and updated on return,
 * *nsend_z_ptr is set to the number of charge groups selected here.
 */
7571 static void
7572 get_zone_pulse_cgs(gmx_domdec_t *dd,
7573 int zonei, int zone,
7574 int cg0, int cg1,
7575 const int *index_gl,
7576 const int *cgindex,
7577 int dim, int dim_ind,
7578 int dim0, int dim1, int dim2,
7579 real r_comm2, real r_bcomm2,
7580 matrix box,
7581 ivec tric_dist,
7582 rvec *normal,
7583 real skew_fac2_d, real skew_fac_01,
7584 rvec *v_d, rvec *v_0, rvec *v_1,
7585 const dd_corners_t *c,
7586 rvec sf2_round,
7587 gmx_bool bDistBonded,
7588 gmx_bool bBondComm,
7589 gmx_bool bDist2B,
7590 gmx_bool bDistMB,
7591 rvec *cg_cm,
7592 int *cginfo,
7593 gmx_domdec_ind_t *ind,
7594 int **ibuf, int *ibuf_nalloc,
7595 vec_rvec_t *vbuf,
7596 int *nsend_ptr,
7597 int *nat_ptr,
7598 int *nsend_z_ptr)
7600 gmx_domdec_comm_t *comm;
7601 gmx_bool bScrew;
7602 gmx_bool bDistMB_pulse;
7603 int cg,i;
7604 real r2,rb2,r,tric_sh;
7605 rvec rn,rb;
7606 int dimd;
7607 int nsend_z,nsend,nat;
7609 comm = dd->comm;
7611 bScrew = (dd->bScrewPBC && dim == XX);
7613 bDistMB_pulse = (bDistMB && bDistBonded);
/* Continue counting from the totals passed in by the caller */
7615 nsend_z = 0;
7616 nsend = *nsend_ptr;
7617 nat = *nat_ptr;
7619 for(cg=cg0; cg<cg1; cg++)
7621 r2 = 0;
7622 rb2 = 0;
7623 if (tric_dist[dim_ind] == 0)
7625 /* Rectangular direction, easy */
7626 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7627 if (r > 0)
7629 r2 += r*r;
7631 if (bDistMB_pulse)
7633 r = cg_cm[cg][dim] - c->bc[dim_ind];
7634 if (r > 0)
7636 rb2 += r*r;
7639 /* Rounding gives at most a 16% reduction
7640 * in communicated atoms
7642 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7644 r = cg_cm[cg][dim0] - c->cr0;
7645 /* This is the first dimension, so always r >= 0 */
7646 r2 += r*r;
7647 if (bDistMB_pulse)
7649 rb2 += r*r;
7652 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7654 r = cg_cm[cg][dim1] - c->cr1[zone];
7655 if (r > 0)
7657 r2 += r*r;
7659 if (bDistMB_pulse)
7661 r = cg_cm[cg][dim1] - c->bcr1;
7662 if (r > 0)
7664 rb2 += r*r;
7669 else
7671 /* Triclinic direction, more complicated */
7672 clear_rvec(rn);
7673 clear_rvec(rb);
7674 /* Rounding, conservative as the skew_fac multiplication
7675 * will slightly underestimate the distance.
7677 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7679 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7680 for(i=dim0+1; i<DIM; i++)
7682 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7684 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7685 if (bDistMB_pulse)
7687 rb[dim0] = rn[dim0];
7688 rb2 = r2;
7690 /* Take care that the cell planes along dim0 might not
7691 * be orthogonal to those along dim1 and dim2.
7693 for(i=1; i<=dim_ind; i++)
7695 dimd = dd->dim[i];
7696 if (normal[dim0][dimd] > 0)
7698 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7699 if (bDistMB_pulse)
7701 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7706 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7708 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7709 tric_sh = 0;
7710 for(i=dim1+1; i<DIM; i++)
7712 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7714 rn[dim1] += tric_sh;
7715 if (rn[dim1] > 0)
7717 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7718 /* Take care of coupling of the distances
7719 * to the planes along dim0 and dim1 through dim2.
7721 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7722 /* Take care that the cell planes along dim1
7723 * might not be orthogonal to that along dim2.
7725 if (normal[dim1][dim2] > 0)
7727 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7730 if (bDistMB_pulse)
7732 rb[dim1] +=
7733 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7734 if (rb[dim1] > 0)
7736 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7737 /* Take care of coupling of the distances
7738 * to the planes along dim0 and dim1 through dim2.
7740 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7741 /* Take care that the cell planes along dim1
7742 * might not be orthogonal to that along dim2.
7744 if (normal[dim1][dim2] > 0)
7746 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7751 /* The distance along the communication direction */
7752 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7753 tric_sh = 0;
7754 for(i=dim+1; i<DIM; i++)
7756 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7758 rn[dim] += tric_sh;
7759 if (rn[dim] > 0)
7761 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7762 /* Take care of coupling of the distances
7763 * to the planes along dim0 and dim1 through dim2.
7765 if (dim_ind == 1 && zonei == 1)
7767 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7770 if (bDistMB_pulse)
7772 clear_rvec(rb);
7773 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7774 if (rb[dim] > 0)
7776 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7777 /* Take care of coupling of the distances
7778 * to the planes along dim0 and dim1 through dim2.
7780 if (dim_ind == 1 && zonei == 1)
7782 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7788 if (r2 < r_comm2 ||
7789 (bDistBonded &&
7790 ((bDistMB && rb2 < r_bcomm2) ||
7791 (bDist2B && r2 < r_bcomm2)) &&
7792 (!bBondComm ||
7793 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7794 missing_link(comm->cglink,index_gl[cg],
7795 comm->bLocalCG)))))
7797 /* Make an index to the local charge groups */
7798 if (nsend+1 > ind->nalloc)
7800 ind->nalloc = over_alloc_large(nsend+1);
7801 srenew(ind->index,ind->nalloc);
/* The buffer for the global indices needs room for nsend+1 entries too */
7803 if (nsend+1 > *ibuf_nalloc)
7805 *ibuf_nalloc = over_alloc_large(nsend+1);
7806 srenew(*ibuf,*ibuf_nalloc);
7808 ind->index[nsend] = cg;
7809 (*ibuf)[nsend] = index_gl[cg];
7810 nsend_z++;
7811 vec_rvec_check_alloc(vbuf,nsend+1);
7813 if (dd->ci[dim] == 0)
7815 /* Correct cg_cm for pbc */
7816 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7817 if (bScrew)
7819 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7820 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7823 else
7825 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7827 nsend++;
7828 nat += cgindex[cg+1] - cgindex[cg];
/* Return the updated counts to the caller */
7832 *nsend_ptr = nsend;
7833 *nat_ptr = nat;
7834 *nsend_z_ptr = nsend_z;
/* Set up the communication of charge groups between the domains.
 *
 * The positions used are fr->cg_cm with the group cut-off scheme and
 * state->x with the Verlet cut-off scheme.
 * For each DD dimension and each of its cd->np pulses, the charge
 * groups to send are selected with get_zone_pulse_cgs(), with the work
 * divided over comm->nth threads; the results of threads other than 0
 * are appended to the communication buffers afterwards.
 * Then the counts, the global charge group indices and the positions
 * are exchanged with dd_sendrecv_int() and dd_sendrecv_rvec().
 * The received charge groups are either stored in place, where cgindex
 * and fr->cginfo are extended directly, or merged into the zones with
 * merge_cg_buffers().
 * On return dd->index_gl, dd->cgindex, dd->ncg_tot, dd->nat_tot,
 * comm->nat[] and the zone charge group ranges have been set.
 */
7837 static void setup_dd_communication(gmx_domdec_t *dd,
7838 matrix box,gmx_ddbox_t *ddbox,
7839 t_forcerec *fr,t_state *state,rvec **f)
7841 int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7842 int nzone,nzone_send,zone,zonei,cg0,cg1;
7843 int c,i,j,cg,cg_gl,nrcg;
7844 int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7845 gmx_domdec_comm_t *comm;
7846 gmx_domdec_zones_t *zones;
7847 gmx_domdec_comm_dim_t *cd;
7848 gmx_domdec_ind_t *ind;
7849 cginfo_mb_t *cginfo_mb;
7850 gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7851 real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7852 dd_corners_t corners;
7853 ivec tric_dist;
7854 rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7855 real skew_fac2_d,skew_fac_01;
7856 rvec sf2_round;
7857 int nsend,nat;
7858 int th;
7860 if (debug)
7862 fprintf(debug,"Setting up DD communication\n");
7865 comm = dd->comm;
/* Select the position array that belongs to the cut-off scheme */
7867 switch (fr->cutoff_scheme)
7869 case ecutsGROUP:
7870 cg_cm = fr->cg_cm;
7871 break;
7872 case ecutsVERLET:
7873 cg_cm = state->x;
7874 break;
7875 default:
7876 gmx_incons("unimplemented");
7877 cg_cm = NULL;
7880 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7882 dim = dd->dim[dim_ind];
7884 /* Check if we need to use triclinic distances */
7885 tric_dist[dim_ind] = 0;
7886 for(i=0; i<=dim_ind; i++)
7888 if (ddbox->tric_dir[dd->dim[i]])
7890 tric_dist[dim_ind] = 1;
7895 bBondComm = comm->bBondComm;
7897 /* Do we need to determine extra distances for multi-body bondeds? */
7898 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7900 /* Do we need to determine extra distances for only two-body bondeds? */
7901 bDist2B = (bBondComm && !bDistMB);
7903 r_comm2 = sqr(comm->cutoff);
7904 r_bcomm2 = sqr(comm->cutoff_mbody);
7906 if (debug)
7908 fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7911 zones = &comm->zones;
7913 dim0 = dd->dim[0];
7914 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7915 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7917 set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7919 /* Triclinic stuff */
7920 normal = ddbox->normal;
7921 skew_fac_01 = 0;
7922 if (dd->ndim >= 2)
7924 v_0 = ddbox->v[dim0];
7925 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7927 /* Determine the coupling coefficient for the distances
7928 * to the cell planes along dim0 and dim1 through dim2.
7929 * This is required for correct rounding.
7931 skew_fac_01 =
7932 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7933 if (debug)
7935 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7939 if (dd->ndim >= 3)
7941 v_1 = ddbox->v[dim1];
7944 zone_cg_range = zones->cg_range;
7945 index_gl = dd->index_gl;
7946 cgindex = dd->cgindex;
7947 cginfo_mb = fr->cginfo_mb;
7949 zone_cg_range[0] = 0;
7950 zone_cg_range[1] = dd->ncg_home;
7951 comm->zone_ncg1[0] = dd->ncg_home;
7952 pos_cg = dd->ncg_home;
7954 nat_tot = dd->nat_home;
7955 nzone = 1;
7956 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7958 dim = dd->dim[dim_ind];
7959 cd = &comm->cd[dim_ind];
7961 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7963 /* No pbc in this dimension, the first node should not comm. */
7964 nzone_send = 0;
7966 else
7968 nzone_send = nzone;
7971 v_d = ddbox->v[dim];
7972 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7974 cd->bInPlace = TRUE;
7975 for(p=0; p<cd->np; p++)
7977 /* Only atoms communicated in the first pulse are used
7978 * for multi-body bonded interactions or for bBondComm.
7980 bDistBonded = ((bDistMB || bDist2B) && p == 0);
7982 ind = &cd->ind[p];
7983 nsend = 0;
7984 nat = 0;
7985 for(zone=0; zone<nzone_send; zone++)
7987 if (tric_dist[dim_ind] && dim_ind > 0)
7989 /* Determine slightly more optimized skew_fac's
7990 * for rounding.
7991 * This reduces the number of communicated atoms
7992 * by about 10% for 3D DD of rhombic dodecahedra.
7994 for(dimd=0; dimd<dim; dimd++)
7996 sf2_round[dimd] = 1;
7997 if (ddbox->tric_dir[dimd])
7999 for(i=dd->dim[dimd]+1; i<DIM; i++)
8001 /* If we are shifted in dimension i
8002 * and the cell plane is tilted forward
8003 * in dimension i, skip this coupling.
8005 if (!(zones->shift[nzone+zone][i] &&
8006 ddbox->v[dimd][i][dimd] >= 0))
8008 sf2_round[dimd] +=
8009 sqr(ddbox->v[dimd][i][dimd]);
8012 sf2_round[dimd] = 1/sf2_round[dimd];
8017 zonei = zone_perm[dim_ind][zone];
8018 if (p == 0)
8020 /* Here we permutate the zones to obtain a convenient order
8021 * for neighbor searching
8023 cg0 = zone_cg_range[zonei];
8024 cg1 = zone_cg_range[zonei+1];
8026 else
8028 /* Look only at the cg's received in the previous grid pulse
8030 cg1 = zone_cg_range[nzone+zone+1];
8031 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8034 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8035 for(th=0; th<comm->nth; th++)
8037 gmx_domdec_ind_t *ind_p;
8038 int **ibuf_p,*ibuf_nalloc_p;
8039 vec_rvec_t *vbuf_p;
8040 int *nsend_p,*nat_p;
8041 int *nsend_zone_p;
8042 int cg0_th,cg1_th;
8044 if (th == 0)
8046 /* Thread 0 writes in the comm buffers */
8047 ind_p = ind;
8048 ibuf_p = &comm->buf_int;
8049 ibuf_nalloc_p = &comm->nalloc_int;
8050 vbuf_p = &comm->vbuf;
8051 nsend_p = &nsend;
8052 nat_p = &nat;
8053 nsend_zone_p = &ind->nsend[zone];
8055 else
8057 /* Other threads write into temp buffers */
8058 ind_p = &comm->dth[th].ind;
8059 ibuf_p = &comm->dth[th].ibuf;
8060 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8061 vbuf_p = &comm->dth[th].vbuf;
8062 nsend_p = &comm->dth[th].nsend;
8063 nat_p = &comm->dth[th].nat;
8064 nsend_zone_p = &comm->dth[th].nsend_zone;
8066 comm->dth[th].nsend = 0;
8067 comm->dth[th].nat = 0;
8068 comm->dth[th].nsend_zone = 0;
/* Divide the charge group range evenly over the threads */
8071 if (comm->nth == 1)
8073 cg0_th = cg0;
8074 cg1_th = cg1;
8076 else
8078 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8079 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8082 /* Get the cg's for this pulse in this zone */
8083 get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8084 index_gl,cgindex,
8085 dim,dim_ind,dim0,dim1,dim2,
8086 r_comm2,r_bcomm2,
8087 box,tric_dist,
8088 normal,skew_fac2_d,skew_fac_01,
8089 v_d,v_0,v_1,&corners,sf2_round,
8090 bDistBonded,bBondComm,
8091 bDist2B,bDistMB,
8092 cg_cm,fr->cginfo,
8093 ind_p,
8094 ibuf_p,ibuf_nalloc_p,
8095 vbuf_p,
8096 nsend_p,nat_p,
8097 nsend_zone_p);
8100 /* Append data of threads>=1 to the communication buffers */
8101 for(th=1; th<comm->nth; th++)
8103 dd_comm_setup_work_t *dth;
8104 int i,ns1;
8106 dth = &comm->dth[th];
8108 ns1 = nsend + dth->nsend_zone;
8109 if (ns1 > ind->nalloc)
8111 ind->nalloc = over_alloc_dd(ns1);
8112 srenew(ind->index,ind->nalloc);
8114 if (ns1 > comm->nalloc_int)
8116 comm->nalloc_int = over_alloc_dd(ns1);
8117 srenew(comm->buf_int,comm->nalloc_int);
8119 if (ns1 > comm->vbuf.nalloc)
8121 comm->vbuf.nalloc = over_alloc_dd(ns1);
8122 srenew(comm->vbuf.v,comm->vbuf.nalloc);
8125 for(i=0; i<dth->nsend_zone; i++)
8127 ind->index[nsend] = dth->ind.index[i];
8128 comm->buf_int[nsend] = dth->ibuf[i];
8129 copy_rvec(dth->vbuf.v[i],
8130 comm->vbuf.v[nsend]);
8131 nsend++;
8133 nat += dth->nat;
8134 ind->nsend[zone] += dth->nsend_zone;
8137 /* Clear the counts in case we do not have pbc */
8138 for(zone=nzone_send; zone<nzone; zone++)
8140 ind->nsend[zone] = 0;
8142 ind->nsend[nzone] = nsend;
8143 ind->nsend[nzone+1] = nat;
8144 /* Communicate the number of cg's and atoms to receive */
8145 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8146 ind->nsend, nzone+2,
8147 ind->nrecv, nzone+2);
8149 /* The rvec buffer is also required for atom buffers of size nsend
8150 * in dd_move_x and dd_move_f.
8152 vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8154 if (p > 0)
8156 /* We can receive in place if only the last zone is not empty */
8157 for(zone=0; zone<nzone-1; zone++)
8159 if (ind->nrecv[zone] > 0)
8161 cd->bInPlace = FALSE;
8164 if (!cd->bInPlace)
8166 /* The int buffer is only required here for the cg indices */
8167 if (ind->nrecv[nzone] > comm->nalloc_int2)
8169 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8170 srenew(comm->buf_int2,comm->nalloc_int2);
8172 /* The rvec buffer is also required for atom buffers
8173 * of size nrecv in dd_move_x and dd_move_f.
8175 i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8176 vec_rvec_check_alloc(&comm->vbuf2,i);
8180 /* Make space for the global cg indices */
8181 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8182 || dd->cg_nalloc == 0)
8184 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8185 srenew(index_gl,dd->cg_nalloc);
8186 srenew(cgindex,dd->cg_nalloc+1);
8188 /* Communicate the global cg indices */
8189 if (cd->bInPlace)
8191 recv_i = index_gl + pos_cg;
8193 else
8195 recv_i = comm->buf_int2;
8197 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8198 comm->buf_int, nsend,
8199 recv_i, ind->nrecv[nzone]);
8201 /* Make space for cg_cm */
8202 dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8203 if (fr->cutoff_scheme == ecutsGROUP)
8205 cg_cm = fr->cg_cm;
8207 else
8209 cg_cm = state->x;
8211 /* Communicate cg_cm */
8212 if (cd->bInPlace)
8214 recv_vr = cg_cm + pos_cg;
8216 else
8218 recv_vr = comm->vbuf2.v;
8220 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8221 comm->vbuf.v, nsend,
8222 recv_vr, ind->nrecv[nzone]);
8224 /* Make the charge group index */
8225 if (cd->bInPlace)
8227 zone = (p == 0 ? 0 : nzone - 1);
8228 while (zone < nzone)
8230 for(cg=0; cg<ind->nrecv[zone]; cg++)
8232 cg_gl = index_gl[pos_cg];
8233 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8234 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8235 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8236 if (bBondComm)
8238 /* Update the charge group presence,
8239 * so we can use it in the next pass of the loop.
8241 comm->bLocalCG[cg_gl] = TRUE;
8243 pos_cg++;
8245 if (p == 0)
8247 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8249 zone++;
8250 zone_cg_range[nzone+zone] = pos_cg;
8253 else
8255 /* This part of the code is never executed with bBondComm. */
8256 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8257 index_gl,recv_i,cg_cm,recv_vr,
8258 cgindex,fr->cginfo_mb,fr->cginfo);
8259 pos_cg += ind->nrecv[nzone];
8261 nat_tot += ind->nrecv[nzone+1];
8263 if (!cd->bInPlace)
8265 /* Store the atom block for easy copying of communication buffers */
8266 make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
/* The number of zones doubles with each DD dimension */
8268 nzone += nzone;
8270 dd->index_gl = index_gl;
8271 dd->cgindex = cgindex;
8273 dd->ncg_tot = zone_cg_range[zones->n];
8274 dd->nat_tot = nat_tot;
8275 comm->nat[ddnatHOME] = dd->nat_home;
8276 for(i=ddnatZONE; i<ddnatNR; i++)
8278 comm->nat[i] = dd->nat_tot;
8281 if (!bBondComm)
8283 /* We don't need to update cginfo, since that was already done above.
8284 * So we pass NULL for the forcerec.
8286 dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8287 NULL,comm->bLocalCG);
8290 if (debug)
8292 fprintf(debug,"Finished setting up DD communication, zones:");
8293 for(c=0; c<zones->n; c++)
8295 fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8297 fprintf(debug,"\n");
8301 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8303 int c;
8305 for(c=0; c<zones->nizone; c++)
8307 zones->izone[c].cg1 = zones->cg_range[c+1];
8308 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8309 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8313 static void set_zones_size(gmx_domdec_t *dd,
8314 matrix box,const gmx_ddbox_t *ddbox,
8315 int zone_start,int zone_end)
8317 gmx_domdec_comm_t *comm;
8318 gmx_domdec_zones_t *zones;
8319 gmx_bool bDistMB;
8320 int z,zi,zj0,zj1,d,dim;
8321 real rcs,rcmbs;
8322 int i,j;
8323 real size_j,add_tric;
8324 real vol;
8326 comm = dd->comm;
8328 zones = &comm->zones;
8330 /* Do we need to determine extra distances for multi-body bondeds? */
8331 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8333 for(z=zone_start; z<zone_end; z++)
8335 /* Copy cell limits to zone limits.
8336 * Valid for non-DD dims and non-shifted dims.
8338 copy_rvec(comm->cell_x0,zones->size[z].x0);
8339 copy_rvec(comm->cell_x1,zones->size[z].x1);
8342 for(d=0; d<dd->ndim; d++)
8344 dim = dd->dim[d];
8346 for(z=0; z<zones->n; z++)
8348 /* With a staggered grid we have different sizes
8349 * for non-shifted dimensions.
8351 if (dd->bGridJump && zones->shift[z][dim] == 0)
8353 if (d == 1)
8355 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8356 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8358 else if (d == 2)
8360 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8361 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8366 rcs = comm->cutoff;
8367 rcmbs = comm->cutoff_mbody;
8368 if (ddbox->tric_dir[dim])
8370 rcs /= ddbox->skew_fac[dim];
8371 rcmbs /= ddbox->skew_fac[dim];
8374 /* Set the lower limit for the shifted zone dimensions */
8375 for(z=zone_start; z<zone_end; z++)
8377 if (zones->shift[z][dim] > 0)
8379 dim = dd->dim[d];
8380 if (!dd->bGridJump || d == 0)
8382 zones->size[z].x0[dim] = comm->cell_x1[dim];
8383 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8385 else
8387 /* Here we take the lower limit of the zone from
8388 * the lowest domain of the zone below.
8390 if (z < 4)
8392 zones->size[z].x0[dim] =
8393 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8395 else
8397 if (d == 1)
8399 zones->size[z].x0[dim] =
8400 zones->size[zone_perm[2][z-4]].x0[dim];
8402 else
8404 zones->size[z].x0[dim] =
8405 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8408 /* A temporary limit, is updated below */
8409 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8411 if (bDistMB)
8413 for(zi=0; zi<zones->nizone; zi++)
8415 if (zones->shift[zi][dim] == 0)
8417 /* This takes the whole zone into account.
8418 * With multiple pulses this will lead
8419 * to a larger zone then strictly necessary.
8421 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8422 zones->size[zi].x1[dim]+rcmbs);
8430 /* Loop over the i-zones to set the upper limit of each
8431 * j-zone they see.
8433 for(zi=0; zi<zones->nizone; zi++)
8435 if (zones->shift[zi][dim] == 0)
8437 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8439 if (zones->shift[z][dim] > 0)
8441 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8442 zones->size[zi].x1[dim]+rcs);
8449 for(z=zone_start; z<zone_end; z++)
8451 /* Initialization only required to keep the compiler happy */
8452 rvec corner_min={0,0,0},corner_max={0,0,0},corner;
8453 int nc,c;
8455 /* To determine the bounding box for a zone we need to find
8456 * the extreme corners of 4, 2 or 1 corners.
8458 nc = 1 << (ddbox->npbcdim - 1);
8460 for(c=0; c<nc; c++)
8462 /* Set up a zone corner at x=0, ignoring trilinic couplings */
8463 corner[XX] = 0;
8464 if ((c & 1) == 0)
8466 corner[YY] = zones->size[z].x0[YY];
8468 else
8470 corner[YY] = zones->size[z].x1[YY];
8472 if ((c & 2) == 0)
8474 corner[ZZ] = zones->size[z].x0[ZZ];
8476 else
8478 corner[ZZ] = zones->size[z].x1[ZZ];
8480 if (dd->ndim == 1 && box[ZZ][YY] != 0)
8482 /* With 1D domain decomposition the cg's are not in
8483 * the triclinic box, but triclinic x-y and rectangular y-z.
8484 * Shift y back, so it will later end up at 0.
8486 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8488 /* Apply the triclinic couplings */
8489 for(i=YY; i<ddbox->npbcdim; i++)
8491 for(j=XX; j<i; j++)
8493 corner[j] += corner[i]*box[i][j]/box[i][i];
8496 if (c == 0)
8498 copy_rvec(corner,corner_min);
8499 copy_rvec(corner,corner_max);
8501 else
8503 for(i=0; i<DIM; i++)
8505 corner_min[i] = min(corner_min[i],corner[i]);
8506 corner_max[i] = max(corner_max[i],corner[i]);
8510 /* Copy the extreme cornes without offset along x */
8511 for(i=0; i<DIM; i++)
8513 zones->size[z].bb_x0[i] = corner_min[i];
8514 zones->size[z].bb_x1[i] = corner_max[i];
8516 /* Add the offset along x */
8517 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8518 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8521 if (zone_start == 0)
8523 vol = 1;
8524 for(dim=0; dim<DIM; dim++)
8526 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8528 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8531 if (debug)
8533 for(z=zone_start; z<zone_end; z++)
8535 fprintf(debug,"zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8537 zones->size[z].x0[XX],zones->size[z].x1[XX],
8538 zones->size[z].x0[YY],zones->size[z].x1[YY],
8539 zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8540 fprintf(debug,"zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8542 zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8543 zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8544 zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8549 static int comp_cgsort(const void *a,const void *b)
8551 int comp;
8553 gmx_cgsort_t *cga,*cgb;
8554 cga = (gmx_cgsort_t *)a;
8555 cgb = (gmx_cgsort_t *)b;
8557 comp = cga->nsc - cgb->nsc;
8558 if (comp == 0)
8560 comp = cga->ind_gl - cgb->ind_gl;
8563 return comp;
8566 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8567 int *a,int *buf)
8569 int i;
8571 /* Order the data */
8572 for(i=0; i<n; i++)
8574 buf[i] = a[sort[i].ind];
8577 /* Copy back to the original array */
8578 for(i=0; i<n; i++)
8580 a[i] = buf[i];
8584 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8585 rvec *v,rvec *buf)
8587 int i;
8589 /* Order the data */
8590 for(i=0; i<n; i++)
8592 copy_rvec(v[sort[i].ind],buf[i]);
8595 /* Copy back to the original array */
8596 for(i=0; i<n; i++)
8598 copy_rvec(buf[i],v[i]);
8602 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8603 rvec *v,rvec *buf)
8605 int a,atot,cg,cg0,cg1,i;
8607 if (cgindex == NULL)
8609 /* Avoid the useless loop of the atoms within a cg */
8610 order_vec_cg(ncg,sort,v,buf);
8612 return;
8615 /* Order the data */
8616 a = 0;
8617 for(cg=0; cg<ncg; cg++)
8619 cg0 = cgindex[sort[cg].ind];
8620 cg1 = cgindex[sort[cg].ind+1];
8621 for(i=cg0; i<cg1; i++)
8623 copy_rvec(v[i],buf[a]);
8624 a++;
8627 atot = a;
8629 /* Copy back to the original array */
8630 for(a=0; a<atot; a++)
8632 copy_rvec(buf[a],v[a]);
8636 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8637 int nsort_new,gmx_cgsort_t *sort_new,
8638 gmx_cgsort_t *sort1)
8640 int i1,i2,i_new;
8642 /* The new indices are not very ordered, so we qsort them */
8643 qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8645 /* sort2 is already ordered, so now we can merge the two arrays */
8646 i1 = 0;
8647 i2 = 0;
8648 i_new = 0;
8649 while(i2 < nsort2 || i_new < nsort_new)
8651 if (i2 == nsort2)
8653 sort1[i1++] = sort_new[i_new++];
8655 else if (i_new == nsort_new)
8657 sort1[i1++] = sort2[i2++];
8659 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8660 (sort2[i2].nsc == sort_new[i_new].nsc &&
8661 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8663 sort1[i1++] = sort2[i2++];
8665 else
8667 sort1[i1++] = sort_new[i_new++];
8672 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8674 gmx_domdec_sort_t *sort;
8675 gmx_cgsort_t *cgsort,*sort_i;
8676 int ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8677 int sort_last,sort_skip;
8679 sort = dd->comm->sort;
8681 a = fr->ns.grid->cell_index;
8683 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8685 if (ncg_home_old >= 0)
8687 /* The charge groups that remained in the same ns grid cell
8688 * are completely ordered. So we can sort efficiently by sorting
8689 * the charge groups that did move into the stationary list.
8691 ncg_new = 0;
8692 nsort2 = 0;
8693 nsort_new = 0;
8694 for(i=0; i<dd->ncg_home; i++)
8696 /* Check if this cg did not move to another node */
8697 if (a[i] < moved)
8699 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8701 /* This cg is new on this node or moved ns grid cell */
8702 if (nsort_new >= sort->sort_new_nalloc)
8704 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8705 srenew(sort->sort_new,sort->sort_new_nalloc);
8707 sort_i = &(sort->sort_new[nsort_new++]);
8709 else
8711 /* This cg did not move */
8712 sort_i = &(sort->sort2[nsort2++]);
8714 /* Sort on the ns grid cell indices
8715 * and the global topology index.
8716 * index_gl is irrelevant with cell ns,
8717 * but we set it here anyhow to avoid a conditional.
8719 sort_i->nsc = a[i];
8720 sort_i->ind_gl = dd->index_gl[i];
8721 sort_i->ind = i;
8722 ncg_new++;
8725 if (debug)
8727 fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8728 nsort2,nsort_new);
8730 /* Sort efficiently */
8731 ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8732 sort->sort);
8734 else
8736 cgsort = sort->sort;
8737 ncg_new = 0;
8738 for(i=0; i<dd->ncg_home; i++)
8740 /* Sort on the ns grid cell indices
8741 * and the global topology index
8743 cgsort[i].nsc = a[i];
8744 cgsort[i].ind_gl = dd->index_gl[i];
8745 cgsort[i].ind = i;
8746 if (cgsort[i].nsc < moved)
8748 ncg_new++;
8751 if (debug)
8753 fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8755 /* Determine the order of the charge groups using qsort */
8756 qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8759 return ncg_new;
8762 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8764 gmx_cgsort_t *sort;
8765 int ncg_new,i,*a,na;
8767 sort = dd->comm->sort->sort;
8769 nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8771 ncg_new = 0;
8772 for(i=0; i<na; i++)
8774 if (a[i] >= 0)
8776 sort[ncg_new].ind = a[i];
8777 ncg_new++;
8781 return ncg_new;
8784 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8785 rvec *cgcm,t_forcerec *fr,t_state *state,
8786 int ncg_home_old)
8788 gmx_domdec_sort_t *sort;
8789 gmx_cgsort_t *cgsort,*sort_i;
8790 int *cgindex;
8791 int ncg_new,i,*ibuf,cgsize;
8792 rvec *vbuf;
8794 sort = dd->comm->sort;
8796 if (dd->ncg_home > sort->sort_nalloc)
8798 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8799 srenew(sort->sort,sort->sort_nalloc);
8800 srenew(sort->sort2,sort->sort_nalloc);
8802 cgsort = sort->sort;
8804 switch (fr->cutoff_scheme)
8806 case ecutsGROUP:
8807 ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8808 break;
8809 case ecutsVERLET:
8810 ncg_new = dd_sort_order_nbnxn(dd,fr);
8811 break;
8812 default:
8813 gmx_incons("unimplemented");
8814 ncg_new = 0;
8817 /* We alloc with the old size, since cgindex is still old */
8818 vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8819 vbuf = dd->comm->vbuf.v;
8821 if (dd->comm->bCGs)
8823 cgindex = dd->cgindex;
8825 else
8827 cgindex = NULL;
8830 /* Remove the charge groups which are no longer at home here */
8831 dd->ncg_home = ncg_new;
8832 if (debug)
8834 fprintf(debug,"Set the new home charge group count to %d\n",
8835 dd->ncg_home);
8838 /* Reorder the state */
8839 for(i=0; i<estNR; i++)
8841 if (EST_DISTR(i) && (state->flags & (1<<i)))
8843 switch (i)
8845 case estX:
8846 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8847 break;
8848 case estV:
8849 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8850 break;
8851 case estSDX:
8852 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8853 break;
8854 case estCGP:
8855 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8856 break;
8857 case estLD_RNG:
8858 case estLD_RNGI:
8859 case estDISRE_INITF:
8860 case estDISRE_RM3TAV:
8861 case estORIRE_INITF:
8862 case estORIRE_DTAV:
8863 /* No ordering required */
8864 break;
8865 default:
8866 gmx_incons("Unknown state entry encountered in dd_sort_state");
8867 break;
8871 if (fr->cutoff_scheme == ecutsGROUP)
8873 /* Reorder cgcm */
8874 order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8877 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8879 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8880 srenew(sort->ibuf,sort->ibuf_nalloc);
8882 ibuf = sort->ibuf;
8883 /* Reorder the global cg index */
8884 order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8885 /* Reorder the cginfo */
8886 order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8887 /* Rebuild the local cg index */
8888 if (dd->comm->bCGs)
8890 ibuf[0] = 0;
8891 for(i=0; i<dd->ncg_home; i++)
8893 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8894 ibuf[i+1] = ibuf[i] + cgsize;
8896 for(i=0; i<dd->ncg_home+1; i++)
8898 dd->cgindex[i] = ibuf[i];
8901 else
8903 for(i=0; i<dd->ncg_home+1; i++)
8905 dd->cgindex[i] = i;
8908 /* Set the home atom number */
8909 dd->nat_home = dd->cgindex[dd->ncg_home];
8911 if (fr->cutoff_scheme == ecutsVERLET)
8913 /* The atoms are now exactly in grid order, update the grid order */
8914 nbnxn_set_atomorder(fr->nbv->nbs);
8916 else
8918 /* Copy the sorted ns cell indices back to the ns grid struct */
8919 for(i=0; i<dd->ncg_home; i++)
8921 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8923 fr->ns.grid->nr = dd->ncg_home;
8927 static void add_dd_statistics(gmx_domdec_t *dd)
8929 gmx_domdec_comm_t *comm;
8930 int ddnat;
8932 comm = dd->comm;
8934 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8936 comm->sum_nat[ddnat-ddnatZONE] +=
8937 comm->nat[ddnat] - comm->nat[ddnat-1];
8939 comm->ndecomp++;
8942 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8944 gmx_domdec_comm_t *comm;
8945 int ddnat;
8947 comm = dd->comm;
8949 /* Reset all the statistics and counters for total run counting */
8950 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8952 comm->sum_nat[ddnat-ddnatZONE] = 0;
8954 comm->ndecomp = 0;
8955 comm->nload = 0;
8956 comm->load_step = 0;
8957 comm->load_sum = 0;
8958 comm->load_max = 0;
8959 clear_ivec(comm->load_lim);
8960 comm->load_mdf = 0;
8961 comm->load_pme = 0;
8964 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8966 gmx_domdec_comm_t *comm;
8967 int ddnat;
8968 double av;
8970 comm = cr->dd->comm;
8972 gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8974 if (fplog == NULL)
8976 return;
8979 fprintf(fplog,"\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
8981 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8983 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8984 switch(ddnat)
8986 case ddnatZONE:
8987 fprintf(fplog,
8988 " av. #atoms communicated per step for force: %d x %.1f\n",
8989 2,av);
8990 break;
8991 case ddnatVSITE:
8992 if (cr->dd->vsite_comm)
8994 fprintf(fplog,
8995 " av. #atoms communicated per step for vsites: %d x %.1f\n",
8996 (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8997 av);
8999 break;
9000 case ddnatCON:
9001 if (cr->dd->constraint_comm)
9003 fprintf(fplog,
9004 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9005 1 + ir->nLincsIter,av);
9007 break;
9008 default:
9009 gmx_incons(" Unknown type for DD statistics");
9012 fprintf(fplog,"\n");
9014 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9016 print_dd_load_av(fplog,cr->dd);
9020 void dd_partition_system(FILE *fplog,
9021 gmx_large_int_t step,
9022 t_commrec *cr,
9023 gmx_bool bMasterState,
9024 int nstglobalcomm,
9025 t_state *state_global,
9026 gmx_mtop_t *top_global,
9027 t_inputrec *ir,
9028 t_state *state_local,
9029 rvec **f,
9030 t_mdatoms *mdatoms,
9031 gmx_localtop_t *top_local,
9032 t_forcerec *fr,
9033 gmx_vsite_t *vsite,
9034 gmx_shellfc_t shellfc,
9035 gmx_constr_t constr,
9036 t_nrnb *nrnb,
9037 gmx_wallcycle_t wcycle,
9038 gmx_bool bVerbose)
9040 gmx_domdec_t *dd;
9041 gmx_domdec_comm_t *comm;
9042 gmx_ddbox_t ddbox={0};
9043 t_block *cgs_gl;
9044 gmx_large_int_t step_pcoupl;
9045 rvec cell_ns_x0,cell_ns_x1;
9046 int i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
9047 gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
9048 gmx_bool bRedist,bSortCG,bResortAll;
9049 ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
9050 real grid_density;
9051 char sbuf[22];
9053 dd = cr->dd;
9054 comm = dd->comm;
9056 bBoxChanged = (bMasterState || DEFORM(*ir));
9057 if (ir->epc != epcNO)
9059 /* With nstpcouple > 1 pressure coupling happens
9060 * one step after calculating the pressure.
9061 * Box scaling happens at the end of the MD step,
9062 * after the DD partitioning.
9063 * We therefore have to do DLB in the first partitioning
9064 * after an MD step where P-coupling occurred.
9065 * We need to determine the last step in which p-coupling occurred.
9066 * MRS -- need to validate this for vv?
9068 n = ir->nstpcouple;
9069 if (n == 1)
9071 step_pcoupl = step - 1;
9073 else
9075 step_pcoupl = ((step - 1)/n)*n + 1;
9077 if (step_pcoupl >= comm->partition_step)
9079 bBoxChanged = TRUE;
9083 bNStGlobalComm = (step % nstglobalcomm == 0);
9085 if (!comm->bDynLoadBal)
9087 bDoDLB = FALSE;
9089 else
9091 /* Should we do dynamic load balancing this step?
9092 * Since it requires (possibly expensive) global communication,
9093 * we might want to do DLB less frequently.
9095 if (bBoxChanged || ir->epc != epcNO)
9097 bDoDLB = bBoxChanged;
9099 else
9101 bDoDLB = bNStGlobalComm;
9105 /* Check if we have recorded loads on the nodes */
9106 if (comm->bRecordLoad && dd_load_count(comm))
9108 if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9110 /* Check if we should use DLB at the second partitioning
9111 * and every 100 partitionings,
9112 * so the extra communication cost is negligible.
9114 n = max(100,nstglobalcomm);
9115 bCheckDLB = (comm->n_load_collect == 0 ||
9116 comm->n_load_have % n == n-1);
9118 else
9120 bCheckDLB = FALSE;
9123 /* Print load every nstlog, first and last step to the log file */
9124 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9125 comm->n_load_collect == 0 ||
9126 (ir->nsteps >= 0 &&
9127 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9129 /* Avoid extra communication due to verbose screen output
9130 * when nstglobalcomm is set.
9132 if (bDoDLB || bLogLoad || bCheckDLB ||
9133 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9135 get_load_distribution(dd,wcycle);
9136 if (DDMASTER(dd))
9138 if (bLogLoad)
9140 dd_print_load(fplog,dd,step-1);
9142 if (bVerbose)
9144 dd_print_load_verbose(dd);
9147 comm->n_load_collect++;
9149 if (bCheckDLB) {
9150 /* Since the timings are node dependent, the master decides */
9151 if (DDMASTER(dd))
9153 bTurnOnDLB =
9154 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9155 if (debug)
9157 fprintf(debug,"step %s, imb loss %f\n",
9158 gmx_step_str(step,sbuf),
9159 dd_force_imb_perf_loss(dd));
9162 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9163 if (bTurnOnDLB)
9165 turn_on_dlb(fplog,cr,step);
9166 bDoDLB = TRUE;
9170 comm->n_load_have++;
9173 cgs_gl = &comm->cgs_gl;
9175 bRedist = FALSE;
9176 if (bMasterState)
9178 /* Clear the old state */
9179 clear_dd_indices(dd,0,0);
9181 set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9182 TRUE,cgs_gl,state_global->x,&ddbox);
9184 get_cg_distribution(fplog,step,dd,cgs_gl,
9185 state_global->box,&ddbox,state_global->x);
9187 dd_distribute_state(dd,cgs_gl,
9188 state_global,state_local,f);
9190 dd_make_local_cgs(dd,&top_local->cgs);
9192 /* Ensure that we have space for the new distribution */
9193 dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9195 if (fr->cutoff_scheme == ecutsGROUP)
9197 calc_cgcm(fplog,0,dd->ncg_home,
9198 &top_local->cgs,state_local->x,fr->cg_cm);
9201 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9203 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9205 cg0 = 0;
9207 else if (state_local->ddp_count != dd->ddp_count)
9209 if (state_local->ddp_count > dd->ddp_count)
9211 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9214 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9216 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9219 /* Clear the old state */
9220 clear_dd_indices(dd,0,0);
9222 /* Build the new indices */
9223 rebuild_cgindex(dd,cgs_gl->index,state_local);
9224 make_dd_indices(dd,cgs_gl->index,0);
9226 if (fr->cutoff_scheme == ecutsGROUP)
9228 /* Redetermine the cg COMs */
9229 calc_cgcm(fplog,0,dd->ncg_home,
9230 &top_local->cgs,state_local->x,fr->cg_cm);
9233 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9235 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9237 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9238 TRUE,&top_local->cgs,state_local->x,&ddbox);
9240 bRedist = comm->bDynLoadBal;
9242 else
9244 /* We have the full state, only redistribute the cgs */
9246 /* Clear the non-home indices */
9247 clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9249 /* Avoid global communication for dim's without pbc and -gcom */
9250 if (!bNStGlobalComm)
9252 copy_rvec(comm->box0 ,ddbox.box0 );
9253 copy_rvec(comm->box_size,ddbox.box_size);
9255 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9256 bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9258 bBoxChanged = TRUE;
9259 bRedist = TRUE;
9261 /* For dim's without pbc and -gcom */
9262 copy_rvec(ddbox.box0 ,comm->box0 );
9263 copy_rvec(ddbox.box_size,comm->box_size);
9265 set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9266 step,wcycle);
9268 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9270 write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9273 /* Check if we should sort the charge groups */
9274 if (comm->nstSortCG > 0)
9276 bSortCG = (bMasterState ||
9277 (bRedist && (step % comm->nstSortCG == 0)));
9279 else
9281 bSortCG = FALSE;
9284 ncg_home_old = dd->ncg_home;
9286 ncg_moved = 0;
9287 if (bRedist)
9289 wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9291 dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9292 state_local,f,fr,mdatoms,
9293 !bSortCG,nrnb,&cg0,&ncg_moved);
9295 wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9298 get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9299 dd,&ddbox,
9300 &comm->cell_x0,&comm->cell_x1,
9301 dd->ncg_home,fr->cg_cm,
9302 cell_ns_x0,cell_ns_x1,&grid_density);
9304 if (bBoxChanged)
9306 comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9309 switch (fr->cutoff_scheme)
9311 case ecutsGROUP:
9312 copy_ivec(fr->ns.grid->n,ncells_old);
9313 grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9314 state_local->box,cell_ns_x0,cell_ns_x1,
9315 fr->rlistlong,grid_density);
9316 break;
9317 case ecutsVERLET:
9318 nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9319 break;
9320 default:
9321 gmx_incons("unimplemented");
9323 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9324 copy_ivec(ddbox.tric_dir,comm->tric_dir);
9326 if (bSortCG)
9328 wallcycle_sub_start(wcycle,ewcsDD_GRID);
9330 /* Sort the state on charge group position.
9331 * This enables exact restarts from this step.
9332 * It also improves performance by about 15% with larger numbers
9333 * of atoms per node.
9336 /* Fill the ns grid with the home cell,
9337 * so we can sort with the indices.
9339 set_zones_ncg_home(dd);
9341 switch (fr->cutoff_scheme)
9343 case ecutsVERLET:
9344 set_zones_size(dd,state_local->box,&ddbox,0,1);
9346 nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9348 comm->zones.size[0].bb_x0,
9349 comm->zones.size[0].bb_x1,
9350 0,dd->ncg_home,
9351 comm->zones.dens_zone0,
9352 fr->cginfo,
9353 state_local->x,
9354 ncg_moved,comm->moved,
9355 fr->nbv->grp[eintLocal].kernel_type,
9356 fr->nbv->grp[eintLocal].nbat);
9358 nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9359 break;
9360 case ecutsGROUP:
9361 fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9362 0,dd->ncg_home,fr->cg_cm);
9364 copy_ivec(fr->ns.grid->n,ncells_new);
9365 break;
9366 default:
9367 gmx_incons("unimplemented");
9370 bResortAll = bMasterState;
9372 /* Check if we can use the old order and ns grid cell indices
9373 * of the charge groups to sort the charge groups efficiently.
9375 if (ncells_new[XX] != ncells_old[XX] ||
9376 ncells_new[YY] != ncells_old[YY] ||
9377 ncells_new[ZZ] != ncells_old[ZZ])
9379 bResortAll = TRUE;
9382 if (debug)
9384 fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9385 gmx_step_str(step,sbuf),dd->ncg_home);
9387 dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9388 bResortAll ? -1 : ncg_home_old);
9389 /* Rebuild all the indices */
9390 cg0 = 0;
9391 ga2la_clear(dd->ga2la);
9393 wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9396 wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9398 /* Set up the communication and communicate the coordinates */
9399 setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9401 /* Set the indices */
9402 make_dd_indices(dd,cgs_gl->index,cg0);
9404 /* Set the charge group boundaries for neighbor searching */
9405 set_cg_boundaries(&comm->zones);
9407 if (fr->cutoff_scheme == ecutsVERLET)
9409 set_zones_size(dd,state_local->box,&ddbox,
9410 bSortCG ? 1 : 0,comm->zones.n);
9413 wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9416 write_dd_pdb("dd_home",step,"dump",top_global,cr,
9417 -1,state_local->x,state_local->box);
9420 wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9422 /* Extract a local topology from the global topology */
9423 for(i=0; i<dd->ndim; i++)
9425 np[dd->dim[i]] = comm->cd[i].np;
9427 dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9428 comm->cellsize_min,np,
9430 fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9431 vsite,top_global,top_local);
9433 wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9435 wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9437 /* Set up the special atom communication */
9438 n = comm->nat[ddnatZONE];
9439 for(i=ddnatZONE+1; i<ddnatNR; i++)
9441 switch(i)
9443 case ddnatVSITE:
9444 if (vsite && vsite->n_intercg_vsite)
9446 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9448 break;
9449 case ddnatCON:
9450 if (dd->bInterCGcons || dd->bInterCGsettles)
9452 /* Only for inter-cg constraints we need special code */
9453 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9454 constr,ir->nProjOrder,
9455 top_local->idef.il);
9457 break;
9458 default:
9459 gmx_incons("Unknown special atom type setup");
9461 comm->nat[i] = n;
9464 wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9466 wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9468 /* Make space for the extra coordinates for virtual site
9469 * or constraint communication.
9471 state_local->natoms = comm->nat[ddnatNR-1];
9472 if (state_local->natoms > state_local->nalloc)
9474 dd_realloc_state(state_local,f,state_local->natoms);
9477 if (fr->bF_NoVirSum)
9479 if (vsite && vsite->n_intercg_vsite)
9481 nat_f_novirsum = comm->nat[ddnatVSITE];
9483 else
9485 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9487 nat_f_novirsum = dd->nat_tot;
9489 else
9491 nat_f_novirsum = dd->nat_home;
9495 else
9497 nat_f_novirsum = 0;
9500 /* Set the number of atoms required for the force calculation.
9501 * Forces need to be constrained when using a twin-range setup
9502 * or with energy minimization. For simple simulations we could
9503 * avoid some allocation, zeroing and copying, but this is
9504 * probably not worth the complications and checking.
9506 forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9507 dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9509 /* We make all the mdatoms up to nat_tot_con.
9510 * We could save some work by only setting invmass
9511 * between nat_tot and nat_tot_con.
9513 /* This call also sets the new number of home particles to dd->nat_home */
9514 atoms2md(top_global,ir,
9515 comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9517 /* Now we have the charges we can sort the FE interactions */
9518 dd_sort_local_top(dd,mdatoms,top_local);
9520 if (vsite != NULL)
9522 /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9523 split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9526 if (shellfc)
9528 /* Make the local shell stuff, currently no communication is done */
9529 make_local_shells(cr,mdatoms,shellfc);
9532 if (ir->implicit_solvent)
9534 make_local_gb(cr,fr->born,ir->gb_algorithm);
9537 init_bonded_thread_force_reduction(fr,&top_local->idef);
9539 if (!(cr->duty & DUTY_PME))
9541 /* Send the charges to our PME only node */
9542 gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9543 mdatoms->chargeA,mdatoms->chargeB,
9544 dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9547 if (constr)
9549 set_constraints(constr,top_local,ir,mdatoms,cr);
9552 if (ir->ePull != epullNO)
9554 /* Update the local pull groups */
9555 dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9558 if (ir->bRot)
9560 /* Update the local rotation groups */
9561 dd_make_local_rotation_groups(dd,ir->rot);
9565 add_dd_statistics(dd);
9567 /* Make sure we only count the cycles for this DD partitioning */
9568 clear_dd_cycle_counts(dd);
9570 /* Because the order of the atoms might have changed since
9571 * the last vsite construction, we need to communicate the constructing
9572 * atom coordinates again (for spreading the forces this MD step).
9574 dd_move_x_vsites(dd,state_local->box,state_local->x);
9576 wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9578 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9580 dd_move_x(dd,state_local->box,state_local->x);
9581 write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9582 -1,state_local->x,state_local->box);
9585 /* Store the partitioning step */
9586 comm->partition_step = step;
9588 /* Increase the DD partitioning counter */
9589 dd->ddp_count++;
9590 /* The state currently matches this DD partitioning count, store it */
9591 state_local->ddp_count = dd->ddp_count;
9592 if (bMasterState)
9594 /* The DD master node knows the complete cg distribution,
9595 * store the count so we can possibly skip the cg info communication.
9597 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9600 if (comm->DD_debug > 0)
9602 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9603 check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9604 "after partitioning");