added Verlet scheme and NxN non-bonded functionality
[gromacs.git] / src / mdlib / domdec.c
blob ad1b8c807740ba34b9cf58947527397228b8bebc
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
15 * And Hey:
16 * Gnomes, ROck Monsters And Chili Sauce
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "gmx_fatal.h"
31 #include "gmx_fatal_collective.h"
32 #include "vec.h"
33 #include "domdec.h"
34 #include "domdec_network.h"
35 #include "nrnb.h"
36 #include "pbc.h"
37 #include "chargegroup.h"
38 #include "constr.h"
39 #include "mdatoms.h"
40 #include "names.h"
41 #include "pdbio.h"
42 #include "futil.h"
43 #include "force.h"
44 #include "pme.h"
45 #include "pull.h"
46 #include "pull_rotation.h"
47 #include "gmx_wallcycle.h"
48 #include "mdrun.h"
49 #include "nsgrid.h"
50 #include "shellfc.h"
51 #include "mtop_util.h"
52 #include "gmxfio.h"
53 #include "gmx_ga2la.h"
54 #include "gmx_sort.h"
55 #include "nbnxn_search.h"
56 #include "bondf.h"
57 #include "gmx_omp_nthreads.h"
59 #ifdef GMX_LIB_MPI
60 #include <mpi.h>
61 #endif
62 #ifdef GMX_THREAD_MPI
63 #include "tmpi.h"
64 #endif
66 #define DDRANK(dd,rank) (rank)
67 #define DDMASTERRANK(dd) (dd->masterrank)
69 typedef struct gmx_domdec_master
71 /* The cell boundaries */
72 real **cell_x;
73 /* The global charge group division */
74 int *ncg; /* Number of home charge groups for each node */
75 int *index; /* Index (size nnodes+1) into cg */
76 int *cg; /* Global charge group index */
77 int *nat; /* Number of home atoms for each node. */
78 int *ibuf; /* Buffer for communication */
79 rvec *vbuf; /* Buffer for state scattering and gathering */
80 } gmx_domdec_master_t;
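/* Example layout: with two nodes where node 0 holds global charge groups
 * {3,7} and node 1 holds {1}, the master arrays would be
 *   ncg   = {2, 1}
 *   index = {0, 2, 3}   (node n's cgs are cg[index[n]] .. cg[index[n+1]-1])
 *   cg    = {3, 7, 1}
 */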
82 typedef struct
84 /* The numbers of charge groups to send and receive for each cell
85 * that requires communication; the last entry contains the total
86 * number of atoms that need to be communicated.
88 int nsend[DD_MAXIZONE+2];
89 int nrecv[DD_MAXIZONE+2];
90 /* The charge groups to send */
91 int *index;
92 int nalloc;
93 /* The atom range for non-in-place communication */
94 int cell2at0[DD_MAXIZONE];
95 int cell2at1[DD_MAXIZONE];
96 } gmx_domdec_ind_t;
98 typedef struct
100 int np; /* Number of grid pulses in this dimension */
101 int np_dlb; /* The number of pulses needed with DLB, for use with edlbAUTO */
102 gmx_domdec_ind_t *ind; /* The indices to communicate, size np */
103 int np_nalloc;
104 gmx_bool bInPlace; /* Can we communicate in place? */
105 } gmx_domdec_comm_dim_t;
107 typedef struct
109 gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */
110 real *cell_f; /* State var.: cell boundaries, box relative */
111 real *old_cell_f; /* Temp. var.: old cell size */
112 real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
113 real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
114 real *bound_min; /* Temp. var.: lower limit for cell boundary */
115 real *bound_max; /* Temp. var.: upper limit for cell boundary */
116 gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */
117 real *buf_ncd; /* Temp. var. */
118 } gmx_domdec_root_t;
120 #define DD_NLOAD_MAX 9
122 /* Here floats are accurate enough, since these variables
123 * only influence the load balancing, not the actual MD results.
125 typedef struct
127 int nload;
128 float *load;
129 float sum;
130 float max;
131 float sum_m;
132 float cvol_min;
133 float mdf;
134 float pme;
135 int flags;
136 } gmx_domdec_load_t;
138 typedef struct
140 int nsc;
141 int ind_gl;
142 int ind;
143 } gmx_cgsort_t;
145 typedef struct
147 gmx_cgsort_t *sort;
148 gmx_cgsort_t *sort2;
149 int sort_nalloc;
150 gmx_cgsort_t *sort_new;
151 int sort_new_nalloc;
152 int *ibuf;
153 int ibuf_nalloc;
154 } gmx_domdec_sort_t;
156 typedef struct
158 rvec *v;
159 int nalloc;
160 } vec_rvec_t;
162 /* This enum determines the order of the coordinates.
163 * ddnatHOME and ddnatZONE should be first and second,
164 * the others can be ordered as wanted.
166 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
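/* The nat[] array in gmx_domdec_comm_t stores cumulative atom counts in
 * this order: atoms [0,nat[ddnatHOME]) are the home atoms and, e.g.,
 * [nat[ddnatZONE],nat[ddnatVSITE]) are the communicated vsite atoms.
 */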
168 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
169 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
171 typedef struct
173 int dim; /* The dimension */
174 gmx_bool dim_match; /* Tells if DD and PME dims match */
175 int nslab; /* The number of PME slabs in this dimension */
176 real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */
177 int *pp_min; /* The minimum pp node location, size nslab */
178 int *pp_max; /* The maximum pp node location, size nslab */
179 int maxshift; /* The maximum shift for coordinate redistribution in PME */
180 } gmx_ddpme_t;
182 typedef struct
184 real min0; /* The minimum bottom of this zone */
185 real max1; /* The maximum top of this zone */
186 real min1; /* The minimum top of this zone */
187 real mch0; /* The maximum bottom communication height for this zone */
188 real mch1; /* The maximum top communication height for this zone */
189 real p1_0; /* The bottom value of the first cell in this zone */
190 real p1_1; /* The top value of the first cell in this zone */
191 } gmx_ddzone_t;
193 typedef struct
195 gmx_domdec_ind_t ind;
196 int *ibuf;
197 int ibuf_nalloc;
198 vec_rvec_t vbuf;
199 int nsend;
200 int nat;
201 int nsend_zone;
202 } dd_comm_setup_work_t;
204 typedef struct gmx_domdec_comm
206 /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
207 * unless stated otherwise.
210 /* The number of decomposition dimensions for PME, 0: no PME */
211 int npmedecompdim;
212 /* The number of nodes doing PME (PP/PME or only PME) */
213 int npmenodes;
214 int npmenodes_x;
215 int npmenodes_y;
216 /* The communication setup including the PME only nodes */
217 gmx_bool bCartesianPP_PME;
218 ivec ntot;
219 int cartpmedim;
220 int *pmenodes; /* size npmenodes */
221 int *ddindex2simnodeid; /* size nnodes, only with bCartesianPP
222 * but without bCartesianPP_PME */
223 gmx_ddpme_t ddpme[2];
225 /* The DD particle-particle nodes only */
226 gmx_bool bCartesianPP;
227 int *ddindex2ddnodeid; /* size nnodes, only with bCartesianPP_PME */
229 /* The global charge groups */
230 t_block cgs_gl;
232 /* Should we sort the cgs */
233 int nstSortCG;
234 gmx_domdec_sort_t *sort;
236 /* Are there charge groups? */
237 gmx_bool bCGs;
239 /* Are there bonded and multi-body interactions between charge groups? */
240 gmx_bool bInterCGBondeds;
241 gmx_bool bInterCGMultiBody;
243 /* Data for the optional bonded interaction atom communication range */
244 gmx_bool bBondComm;
245 t_blocka *cglink;
246 char *bLocalCG;
248 /* The DLB option */
249 int eDLB;
250 /* Are we actually using DLB? */
251 gmx_bool bDynLoadBal;
253 /* Cell sizes for static load balancing, first index cartesian */
254 real **slb_frac;
256 /* The width of the communicated boundaries */
257 real cutoff_mbody;
258 real cutoff;
259 /* The minimum cell size (including triclinic correction) */
260 rvec cellsize_min;
261 /* For dlb, for use with edlbAUTO */
262 rvec cellsize_min_dlb;
263 /* The lower limit for the DD cell size with DLB */
264 real cellsize_limit;
265 /* Effectively no NB cut-off limit with DLB for systems without PBC? */
266 gmx_bool bVacDLBNoLimit;
268 /* tric_dir is only stored here because dd_get_ns_ranges needs it */
269 ivec tric_dir;
270 /* box0 and box_size are required with dimensions without pbc and with -gcom */
271 rvec box0;
272 rvec box_size;
274 /* The cell boundaries */
275 rvec cell_x0;
276 rvec cell_x1;
278 /* The old location of the cell boundaries, to check cg displacements */
279 rvec old_cell_x0;
280 rvec old_cell_x1;
282 /* The communication setup and charge group boundaries for the zones */
283 gmx_domdec_zones_t zones;
285 /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
286 * cell boundaries of neighboring cells for dynamic load balancing.
288 gmx_ddzone_t zone_d1[2];
289 gmx_ddzone_t zone_d2[2][2];
291 /* The coordinate/force communication setup and indices */
292 gmx_domdec_comm_dim_t cd[DIM];
293 /* The maximum number of cells to communicate with in one dimension */
294 int maxpulse;
296 /* Which cg distribution is stored on the master node */
297 int master_cg_ddp_count;
299 /* The number of cg's received from the direct neighbors */
300 int zone_ncg1[DD_MAXZONE];
302 /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
303 int nat[ddnatNR];
305 /* Array for signalling if atoms have moved to another domain */
306 int *moved;
307 int moved_nalloc;
309 /* Communication buffer for general use */
310 int *buf_int;
311 int nalloc_int;
313 /* Communication buffer for general use */
314 vec_rvec_t vbuf;
316 /* Temporary storage for thread parallel communication setup */
317 int nth;
318 dd_comm_setup_work_t *dth;
320 /* Communication buffers only used with multiple grid pulses */
321 int *buf_int2;
322 int nalloc_int2;
323 vec_rvec_t vbuf2;
325 /* Communication buffers for local redistribution */
326 int **cggl_flag;
327 int cggl_flag_nalloc[DIM*2];
328 rvec **cgcm_state;
329 int cgcm_state_nalloc[DIM*2];
331 /* Cell sizes for dynamic load balancing */
332 gmx_domdec_root_t **root;
333 real *cell_f_row;
334 real cell_f0[DIM];
335 real cell_f1[DIM];
336 real cell_f_max0[DIM];
337 real cell_f_min1[DIM];
339 /* Stuff for load communication */
340 gmx_bool bRecordLoad;
341 gmx_domdec_load_t *load;
342 #ifdef GMX_MPI
343 MPI_Comm *mpi_comm_load;
344 #endif
346 /* Maximum DLB scaling per load balancing step in percent */
347 int dlb_scale_lim;
349 /* Cycle counters */
350 float cycl[ddCyclNr];
351 int cycl_n[ddCyclNr];
352 float cycl_max[ddCyclNr];
353 /* Flop counter: 0=no, 1=yes, >=2: with (eFlop-1)*5% noise */
354 int eFlop;
355 double flop;
356 int flop_n;
357 /* How often did we have load measurements */
358 int n_load_have;
359 /* How often have we collected the load measurements */
360 int n_load_collect;
362 /* Statistics */
363 double sum_nat[ddnatNR-ddnatZONE];
364 int ndecomp;
365 int nload;
366 double load_step;
367 double load_sum;
368 double load_max;
369 ivec load_lim;
370 double load_mdf;
371 double load_pme;
373 /* The last partition step */
374 gmx_large_int_t partition_step;
376 /* Debugging */
377 int nstDDDump;
378 int nstDDDumpGrid;
379 int DD_debug;
380 } gmx_domdec_comm_t;
382 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
383 #define DD_CGIBS 2
385 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
386 #define DD_FLAG_NRCG 65535
387 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
388 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
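/* Example flag word: a charge group of 5 atoms that moves forward along
 * decomposition dimension 1 is encoded as
 *   flag = 5 | DD_FLAG_FW(1) = 5 | (1<<18)
 * and the atom count is recovered with (flag & DD_FLAG_NRCG).
 */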
390 /* Zone permutation required to obtain consecutive charge groups
391 * for neighbor searching.
393 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
395 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
396 * components see only j zones with that component 0.
399 /* The DD zone order */
400 static const ivec dd_zo[DD_MAXZONE] =
401 {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
403 /* The 3D setup */
404 #define dd_z3n 8
405 #define dd_zp3n 4
406 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
408 /* The 2D setup */
409 #define dd_z2n 4
410 #define dd_zp2n 2
411 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
413 /* The 1D setup */
414 #define dd_z1n 2
415 #define dd_zp1n 1
416 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
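/* Each dd_zp row reads {izone, j0, j1}: i-zone izone interacts with
 * j-zones j0 up to j1-1. E.g. in the 3D setup {1,3,6} means i-zone 1
 * sees j-zones 3, 4 and 5, while {0,0,8} pairs i-zone 0 with all 8 zones.
 */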
418 /* Factors used to avoid problems due to rounding issues */
419 #define DD_CELL_MARGIN 1.0001
420 #define DD_CELL_MARGIN2 1.00005
421 /* Factor to account for pressure scaling during nstlist steps */
422 #define DD_PRES_SCALE_MARGIN 1.02
424 /* Allowed performance loss before we turn on DLB or warn */
425 #define DD_PERF_LOSS 0.05
427 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
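/* Worked example for DD_CELL_F_SIZE: with nc[dim[di]] = 4, di = 0 gives
 * 4+1+0+1+0 = 6 reals and di = 1 gives 4+1+2+1+1 = 9 reals: the nc+1
 * relative cell boundaries plus extra slots, presumably for the extremes
 * that are passed along with the row communication.
 */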
429 /* Use separate MPI send and receive commands
430 * when nnodes <= GMX_DD_NNODES_SENDRECV.
431 * This saves memory (and some copying for small nnodes).
432 * For high parallelization scatter and gather calls are used.
434 #define GMX_DD_NNODES_SENDRECV 4
437 /*
438 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
440 static void index2xyz(ivec nc,int ind,ivec xyz)
442 xyz[XX] = ind % nc[XX];
443 xyz[YY] = (ind / nc[XX]) % nc[YY];
444 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
446 */
448 /* This order is required to minimize the coordinate communication in PME
449 * which uses decomposition in the x direction.
451 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
453 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
455 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
456 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
457 xyz[ZZ] = ind % nc[ZZ];
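/* Round-trip example with nc = {4,3,2} and xyz = {1,2,1}:
 * dd_index(nc,xyz) = ((1*3 + 2)*2) + 1 = 11, and ddindex2xyz(nc,11,xyz)
 * recovers xyz[XX] = 11/(3*2) = 1, xyz[YY] = (11/2)%3 = 2, xyz[ZZ] = 11%2 = 1.
 */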
460 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
462 int ddindex;
463 int ddnodeid=-1;
465 ddindex = dd_index(dd->nc,c);
466 if (dd->comm->bCartesianPP_PME)
468 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
470 else if (dd->comm->bCartesianPP)
472 #ifdef GMX_MPI
473 MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
474 #endif
476 else
478 ddnodeid = ddindex;
481 return ddnodeid;
484 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
486 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
489 int ddglatnr(gmx_domdec_t *dd,int i)
491 int atnr;
493 if (dd == NULL)
495 atnr = i + 1;
497 else
499 if (i >= dd->comm->nat[ddnatNR-1])
501 gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
503 atnr = dd->gatindex[i] + 1;
506 return atnr;
509 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
511 return &dd->comm->cgs_gl;
514 static void vec_rvec_init(vec_rvec_t *v)
516 v->nalloc = 0;
517 v->v = NULL;
520 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
522 if (n > v->nalloc)
524 v->nalloc = over_alloc_dd(n);
525 srenew(v->v,v->nalloc);
529 void dd_store_state(gmx_domdec_t *dd,t_state *state)
531 int i;
533 if (state->ddp_count != dd->ddp_count)
535 gmx_incons("The state does not the domain decomposition state");
538 state->ncg_gl = dd->ncg_home;
539 if (state->ncg_gl > state->cg_gl_nalloc)
541 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
542 srenew(state->cg_gl,state->cg_gl_nalloc);
544 for(i=0; i<state->ncg_gl; i++)
546 state->cg_gl[i] = dd->index_gl[i];
549 state->ddp_count_cg_gl = dd->ddp_count;
552 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
554 return &dd->comm->zones;
557 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
558 int *jcg0,int *jcg1,ivec shift0,ivec shift1)
560 gmx_domdec_zones_t *zones;
561 int izone,d,dim;
563 zones = &dd->comm->zones;
565 izone = 0;
566 while (icg >= zones->izone[izone].cg1)
568 izone++;
571 if (izone == 0)
573 *jcg0 = icg;
575 else if (izone < zones->nizone)
577 *jcg0 = zones->izone[izone].jcg0;
579 else
581 gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
582 icg,izone,zones->nizone);
585 *jcg1 = zones->izone[izone].jcg1;
587 for(d=0; d<dd->ndim; d++)
589 dim = dd->dim[d];
590 shift0[dim] = zones->izone[izone].shift0[dim];
591 shift1[dim] = zones->izone[izone].shift1[dim];
592 if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
594 /* A conservative approach; this can be optimized */
595 shift0[dim] -= 1;
596 shift1[dim] += 1;
601 int dd_natoms_vsite(gmx_domdec_t *dd)
603 return dd->comm->nat[ddnatVSITE];
606 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
608 *at_start = dd->comm->nat[ddnatCON-1];
609 *at_end = dd->comm->nat[ddnatCON];
612 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
614 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
615 int *index,*cgindex;
616 gmx_domdec_comm_t *comm;
617 gmx_domdec_comm_dim_t *cd;
618 gmx_domdec_ind_t *ind;
619 rvec shift={0,0,0},*buf,*rbuf;
620 gmx_bool bPBC,bScrew;
622 comm = dd->comm;
624 cgindex = dd->cgindex;
626 buf = comm->vbuf.v;
628 nzone = 1;
629 nat_tot = dd->nat_home;
630 for(d=0; d<dd->ndim; d++)
632 bPBC = (dd->ci[dd->dim[d]] == 0);
633 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
634 if (bPBC)
636 copy_rvec(box[dd->dim[d]],shift);
638 cd = &comm->cd[d];
639 for(p=0; p<cd->np; p++)
641 ind = &cd->ind[p];
642 index = ind->index;
643 n = 0;
644 if (!bPBC)
646 for(i=0; i<ind->nsend[nzone]; i++)
648 at0 = cgindex[index[i]];
649 at1 = cgindex[index[i]+1];
650 for(j=at0; j<at1; j++)
652 copy_rvec(x[j],buf[n]);
653 n++;
657 else if (!bScrew)
659 for(i=0; i<ind->nsend[nzone]; i++)
661 at0 = cgindex[index[i]];
662 at1 = cgindex[index[i]+1];
663 for(j=at0; j<at1; j++)
665 /* We need to shift the coordinates */
666 rvec_add(x[j],shift,buf[n]);
667 n++;
671 else
673 for(i=0; i<ind->nsend[nzone]; i++)
675 at0 = cgindex[index[i]];
676 at1 = cgindex[index[i]+1];
677 for(j=at0; j<at1; j++)
679 /* Shift x */
680 buf[n][XX] = x[j][XX] + shift[XX];
681 /* Rotate y and z.
682 * This operation requires a special shift force
683 * treatment, which is performed in calc_vir.
685 buf[n][YY] = box[YY][YY] - x[j][YY];
686 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
687 n++;
692 if (cd->bInPlace)
694 rbuf = x + nat_tot;
696 else
698 rbuf = comm->vbuf2.v;
700 /* Send and receive the coordinates */
701 dd_sendrecv_rvec(dd, d, dddirBackward,
702 buf, ind->nsend[nzone+1],
703 rbuf, ind->nrecv[nzone+1]);
704 if (!cd->bInPlace)
706 j = 0;
707 for(zone=0; zone<nzone; zone++)
709 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
711 copy_rvec(rbuf[j],x[i]);
712 j++;
716 nat_tot += ind->nrecv[nzone+1];
718 nzone += nzone;
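/* Note that dd_move_x fills the halo one decomposition dimension at a
 * time; nzone doubles after each dimension, so with 3 DD dimensions the
 * receives fill zone 1, then zones 2-3, then zones 4-7.
 */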
722 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
724 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
725 int *index,*cgindex;
726 gmx_domdec_comm_t *comm;
727 gmx_domdec_comm_dim_t *cd;
728 gmx_domdec_ind_t *ind;
729 rvec *buf,*sbuf;
730 ivec vis;
731 int is;
732 gmx_bool bPBC,bScrew;
734 comm = dd->comm;
736 cgindex = dd->cgindex;
738 buf = comm->vbuf.v;
740 n = 0;
741 nzone = comm->zones.n/2;
742 nat_tot = dd->nat_tot;
743 for(d=dd->ndim-1; d>=0; d--)
745 bPBC = (dd->ci[dd->dim[d]] == 0);
746 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
747 if (fshift == NULL && !bScrew)
749 bPBC = FALSE;
751 /* Determine which shift vector we need */
752 clear_ivec(vis);
753 vis[dd->dim[d]] = 1;
754 is = IVEC2IS(vis);
756 cd = &comm->cd[d];
757 for(p=cd->np-1; p>=0; p--) {
758 ind = &cd->ind[p];
759 nat_tot -= ind->nrecv[nzone+1];
760 if (cd->bInPlace)
762 sbuf = f + nat_tot;
764 else
766 sbuf = comm->vbuf2.v;
767 j = 0;
768 for(zone=0; zone<nzone; zone++)
770 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
772 copy_rvec(f[i],sbuf[j]);
773 j++;
777 /* Communicate the forces */
778 dd_sendrecv_rvec(dd, d, dddirForward,
779 sbuf, ind->nrecv[nzone+1],
780 buf, ind->nsend[nzone+1]);
781 index = ind->index;
782 /* Add the received forces */
783 n = 0;
784 if (!bPBC)
786 for(i=0; i<ind->nsend[nzone]; i++)
788 at0 = cgindex[index[i]];
789 at1 = cgindex[index[i]+1];
790 for(j=at0; j<at1; j++)
792 rvec_inc(f[j],buf[n]);
793 n++;
797 else if (!bScrew)
799 for(i=0; i<ind->nsend[nzone]; i++)
801 at0 = cgindex[index[i]];
802 at1 = cgindex[index[i]+1];
803 for(j=at0; j<at1; j++)
805 rvec_inc(f[j],buf[n]);
806 /* Add this force to the shift force */
807 rvec_inc(fshift[is],buf[n]);
808 n++;
812 else
814 for(i=0; i<ind->nsend[nzone]; i++)
816 at0 = cgindex[index[i]];
817 at1 = cgindex[index[i]+1];
818 for(j=at0; j<at1; j++)
820 /* Rotate the force */
821 f[j][XX] += buf[n][XX];
822 f[j][YY] -= buf[n][YY];
823 f[j][ZZ] -= buf[n][ZZ];
824 if (fshift)
826 /* Add this force to the shift force */
827 rvec_inc(fshift[is],buf[n]);
829 n++;
834 nzone /= 2;
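/* dd_move_f is the mirror image of dd_move_x: it walks the dimensions
 * and pulses in reverse order and accumulates with rvec_inc instead of
 * copying, so every halo force is summed back onto its home atom.
 */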
838 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
840 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
841 int *index,*cgindex;
842 gmx_domdec_comm_t *comm;
843 gmx_domdec_comm_dim_t *cd;
844 gmx_domdec_ind_t *ind;
845 real *buf,*rbuf;
847 comm = dd->comm;
849 cgindex = dd->cgindex;
851 buf = &comm->vbuf.v[0][0];
853 nzone = 1;
854 nat_tot = dd->nat_home;
855 for(d=0; d<dd->ndim; d++)
857 cd = &comm->cd[d];
858 for(p=0; p<cd->np; p++)
860 ind = &cd->ind[p];
861 index = ind->index;
862 n = 0;
863 for(i=0; i<ind->nsend[nzone]; i++)
865 at0 = cgindex[index[i]];
866 at1 = cgindex[index[i]+1];
867 for(j=at0; j<at1; j++)
869 buf[n] = v[j];
870 n++;
874 if (cd->bInPlace)
876 rbuf = v + nat_tot;
878 else
880 rbuf = &comm->vbuf2.v[0][0];
882 /* Send and receive the values */
883 dd_sendrecv_real(dd, d, dddirBackward,
884 buf, ind->nsend[nzone+1],
885 rbuf, ind->nrecv[nzone+1]);
886 if (!cd->bInPlace)
888 j = 0;
889 for(zone=0; zone<nzone; zone++)
891 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
893 v[i] = rbuf[j];
894 j++;
898 nat_tot += ind->nrecv[nzone+1];
900 nzone += nzone;
904 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
906 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
907 int *index,*cgindex;
908 gmx_domdec_comm_t *comm;
909 gmx_domdec_comm_dim_t *cd;
910 gmx_domdec_ind_t *ind;
911 real *buf,*sbuf;
913 comm = dd->comm;
915 cgindex = dd->cgindex;
917 buf = &comm->vbuf.v[0][0];
919 n = 0;
920 nzone = comm->zones.n/2;
921 nat_tot = dd->nat_tot;
922 for(d=dd->ndim-1; d>=0; d--)
924 cd = &comm->cd[d];
925 for(p=cd->np-1; p>=0; p--) {
926 ind = &cd->ind[p];
927 nat_tot -= ind->nrecv[nzone+1];
928 if (cd->bInPlace)
930 sbuf = v + nat_tot;
932 else
934 sbuf = &comm->vbuf2.v[0][0];
935 j = 0;
936 for(zone=0; zone<nzone; zone++)
938 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
940 sbuf[j] = v[i];
941 j++;
945 /* Communicate the values */
946 dd_sendrecv_real(dd, d, dddirForward,
947 sbuf, ind->nrecv[nzone+1],
948 buf, ind->nsend[nzone+1]);
949 index = ind->index;
950 /* Add the received values */
951 n = 0;
952 for(i=0; i<ind->nsend[nzone]; i++)
954 at0 = cgindex[index[i]];
955 at1 = cgindex[index[i]+1];
956 for(j=at0; j<at1; j++)
958 v[j] += buf[n];
959 n++;
963 nzone /= 2;
967 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
969 fprintf(fp,"zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
970 d,i,j,
971 zone->min0,zone->max1,
972 zone->mch0,zone->mch1,
973 zone->p1_0,zone->p1_1);
977 #define DDZONECOMM_MAXZONE 5
978 #define DDZONECOMM_BUFSIZE 3
980 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
981 int ddimind,int direction,
982 gmx_ddzone_t *buf_s,int n_s,
983 gmx_ddzone_t *buf_r,int n_r)
985 #define ZBS DDZONECOMM_BUFSIZE
986 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
987 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
988 int i;
990 for(i=0; i<n_s; i++)
992 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
993 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
994 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
995 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
996 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
997 vbuf_s[i*ZBS+1][2] = 0;
998 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
999 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1000 vbuf_s[i*ZBS+2][2] = 0;
1003 dd_sendrecv_rvec(dd, ddimind, direction,
1004 vbuf_s, n_s*ZBS,
1005 vbuf_r, n_r*ZBS);
1007 for(i=0; i<n_r; i++)
1009 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
1010 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
1011 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
1012 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1013 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1014 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1015 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1018 #undef ZBS
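/* Each gmx_ddzone_t carries 7 reals, packed above into
 * DDZONECOMM_BUFSIZE = 3 rvecs (9 reals, 2 of them zero padding), so a
 * message of n zones occupies 3*n rvecs in vbuf_s/vbuf_r.
 */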
1021 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1022 rvec cell_ns_x0,rvec cell_ns_x1)
1024 int d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1025 gmx_ddzone_t *zp;
1026 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1027 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1028 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1029 rvec extr_s[2],extr_r[2];
1030 rvec dh;
1031 real dist_d,c=0,det;
1032 gmx_domdec_comm_t *comm;
1033 gmx_bool bPBC,bUse;
1035 comm = dd->comm;
1037 for(d=1; d<dd->ndim; d++)
1039 dim = dd->dim[d];
1040 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1041 zp->min0 = cell_ns_x0[dim];
1042 zp->max1 = cell_ns_x1[dim];
1043 zp->min1 = cell_ns_x1[dim];
1044 zp->mch0 = cell_ns_x0[dim];
1045 zp->mch1 = cell_ns_x1[dim];
1046 zp->p1_0 = cell_ns_x0[dim];
1047 zp->p1_1 = cell_ns_x1[dim];
1050 for(d=dd->ndim-2; d>=0; d--)
1052 dim = dd->dim[d];
1053 bPBC = (dim < ddbox->npbcdim);
1055 /* Use an rvec to store three reals */
1056 extr_s[d][0] = comm->cell_f0[d+1];
1057 extr_s[d][1] = comm->cell_f1[d+1];
1058 extr_s[d][2] = comm->cell_f1[d+1];
1060 pos = 0;
1061 /* Store the extremes in the backward sending buffer,
1062 * so they get updated separately from the forward communication.
1064 for(d1=d; d1<dd->ndim-1; d1++)
1066 /* We invert the order to be able to use the same loop for buf_e */
1067 buf_s[pos].min0 = extr_s[d1][1];
1068 buf_s[pos].max1 = extr_s[d1][0];
1069 buf_s[pos].min1 = extr_s[d1][2];
1070 buf_s[pos].mch0 = 0;
1071 buf_s[pos].mch1 = 0;
1072 /* Store the cell corner of the dimension we communicate along */
1073 buf_s[pos].p1_0 = comm->cell_x0[dim];
1074 buf_s[pos].p1_1 = 0;
1075 pos++;
1078 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1079 pos++;
1081 if (dd->ndim == 3 && d == 0)
1083 buf_s[pos] = comm->zone_d2[0][1];
1084 pos++;
1085 buf_s[pos] = comm->zone_d1[0];
1086 pos++;
1089 /* We only need to communicate the extremes
1090 * in the forward direction
1092 npulse = comm->cd[d].np;
1093 if (bPBC)
1095 /* Take the minimum to avoid double communication */
1096 npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1098 else
1100 /* Without PBC we should really not communicate over
1101 * the boundaries, but implementing that complicates
1102 * the communication setup and therefore we simply
1103 * do all communication, but ignore some data.
1105 npulse_min = npulse;
1107 for(p=0; p<npulse_min; p++)
1109 /* Communicate the extremes forward */
1110 bUse = (bPBC || dd->ci[dim] > 0);
1112 dd_sendrecv_rvec(dd, d, dddirForward,
1113 extr_s+d, dd->ndim-d-1,
1114 extr_r+d, dd->ndim-d-1);
1116 if (bUse)
1118 for(d1=d; d1<dd->ndim-1; d1++)
1120 extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1121 extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1122 extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1127 buf_size = pos;
1128 for(p=0; p<npulse; p++)
1130 /* Communicate all the zone information backward */
1131 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1133 dd_sendrecv_ddzone(dd, d, dddirBackward,
1134 buf_s, buf_size,
1135 buf_r, buf_size);
1137 clear_rvec(dh);
1138 if (p > 0)
1140 for(d1=d+1; d1<dd->ndim; d1++)
1142 /* Determine the decrease of maximum required
1143 * communication height along d1 due to the distance along d;
1144 * this avoids a lot of useless atom communication.
1146 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1148 if (ddbox->tric_dir[dim])
1150 /* c is the off-diagonal coupling between the cell planes
1151 * along directions d and d1.
1153 c = ddbox->v[dim][dd->dim[d1]][dim];
1155 else
1157 c = 0;
1159 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1160 if (det > 0)
1162 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1164 else
1166 /* A negative value signals out of range */
1167 dh[d1] = -1;
1172 /* Accumulate the extremes over all pulses */
1173 for(i=0; i<buf_size; i++)
1175 if (p == 0)
1177 buf_e[i] = buf_r[i];
1179 else
1181 if (bUse)
1183 buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1184 buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1185 buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1188 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1190 d1 = 1;
1192 else
1194 d1 = d + 1;
1196 if (bUse && dh[d1] >= 0)
1198 buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1199 buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1202 /* Copy the received buffer to the send buffer,
1203 * to pass the data through with the next pulse.
1205 buf_s[i] = buf_r[i];
1207 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1208 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1210 /* Store the extremes */
1211 pos = 0;
1213 for(d1=d; d1<dd->ndim-1; d1++)
1215 extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1216 extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1217 extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1218 pos++;
1221 if (d == 1 || (d == 0 && dd->ndim == 3))
1223 for(i=d; i<2; i++)
1225 comm->zone_d2[1-d][i] = buf_e[pos];
1226 pos++;
1229 if (d == 0)
1231 comm->zone_d1[1] = buf_e[pos];
1232 pos++;
1238 if (dd->ndim >= 2)
1240 dim = dd->dim[1];
1241 for(i=0; i<2; i++)
1243 if (debug)
1245 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1247 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1248 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1251 if (dd->ndim >= 3)
1253 dim = dd->dim[2];
1254 for(i=0; i<2; i++)
1256 for(j=0; j<2; j++)
1258 if (debug)
1260 print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1262 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1263 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1267 for(d=1; d<dd->ndim; d++)
1269 comm->cell_f_max0[d] = extr_s[d-1][0];
1270 comm->cell_f_min1[d] = extr_s[d-1][1];
1271 if (debug)
1273 fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1274 d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1279 static void dd_collect_cg(gmx_domdec_t *dd,
1280 t_state *state_local)
1282 gmx_domdec_master_t *ma=NULL;
1283 int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1284 t_block *cgs_gl;
1286 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1288 /* The master has the correct distribution */
1289 return;
1292 if (state_local->ddp_count == dd->ddp_count)
1294 ncg_home = dd->ncg_home;
1295 cg = dd->index_gl;
1296 nat_home = dd->nat_home;
1298 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1300 cgs_gl = &dd->comm->cgs_gl;
1302 ncg_home = state_local->ncg_gl;
1303 cg = state_local->cg_gl;
1304 nat_home = 0;
1305 for(i=0; i<ncg_home; i++)
1307 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1310 else
1312 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1315 buf2[0] = dd->ncg_home;
1316 buf2[1] = dd->nat_home;
1317 if (DDMASTER(dd))
1319 ma = dd->ma;
1320 ibuf = ma->ibuf;
1322 else
1324 ibuf = NULL;
1326 /* Collect the charge group and atom counts on the master */
1327 dd_gather(dd,2*sizeof(int),buf2,ibuf);
1329 if (DDMASTER(dd))
1331 ma->index[0] = 0;
1332 for(i=0; i<dd->nnodes; i++)
1334 ma->ncg[i] = ma->ibuf[2*i];
1335 ma->nat[i] = ma->ibuf[2*i+1];
1336 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1339 /* Make byte counts and indices */
1340 for(i=0; i<dd->nnodes; i++)
1342 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1343 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1345 if (debug)
1347 fprintf(debug,"Initial charge group distribution: ");
1348 for(i=0; i<dd->nnodes; i++)
1349 fprintf(debug," %d",ma->ncg[i]);
1350 fprintf(debug,"\n");
1354 /* Collect the charge group indices on the master */
1355 dd_gatherv(dd,
1356 dd->ncg_home*sizeof(int),dd->index_gl,
1357 DDMASTER(dd) ? ma->ibuf : NULL,
1358 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1359 DDMASTER(dd) ? ma->cg : NULL);
1361 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1364 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1365 rvec *lv,rvec *v)
1367 gmx_domdec_master_t *ma;
1368 int n,i,c,a,nalloc=0;
1369 rvec *buf=NULL;
1370 t_block *cgs_gl;
1372 ma = dd->ma;
1374 if (!DDMASTER(dd))
1376 #ifdef GMX_MPI
1377 MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1378 dd->rank,dd->mpi_comm_all);
1379 #endif
1380 } else {
1381 /* Copy the master coordinates to the global array */
1382 cgs_gl = &dd->comm->cgs_gl;
1384 n = DDMASTERRANK(dd);
1385 a = 0;
1386 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1388 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1390 copy_rvec(lv[a++],v[c]);
1394 for(n=0; n<dd->nnodes; n++)
1396 if (n != dd->rank)
1398 if (ma->nat[n] > nalloc)
1400 nalloc = over_alloc_dd(ma->nat[n]);
1401 srenew(buf,nalloc);
1403 #ifdef GMX_MPI
1404 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1405 n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1406 #endif
1407 a = 0;
1408 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1410 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1412 copy_rvec(buf[a++],v[c]);
1417 sfree(buf);
1421 static void get_commbuffer_counts(gmx_domdec_t *dd,
1422 int **counts,int **disps)
1424 gmx_domdec_master_t *ma;
1425 int n;
1427 ma = dd->ma;
1429 /* Make the rvec count and displacement arrays */
1430 *counts = ma->ibuf;
1431 *disps = ma->ibuf + dd->nnodes;
1432 for(n=0; n<dd->nnodes; n++)
1434 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1435 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1439 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1440 rvec *lv,rvec *v)
1442 gmx_domdec_master_t *ma;
1443 int *rcounts=NULL,*disps=NULL;
1444 int n,i,c,a;
1445 rvec *buf=NULL;
1446 t_block *cgs_gl;
1448 ma = dd->ma;
1450 if (DDMASTER(dd))
1452 get_commbuffer_counts(dd,&rcounts,&disps);
1454 buf = ma->vbuf;
1457 dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1459 if (DDMASTER(dd))
1461 cgs_gl = &dd->comm->cgs_gl;
1463 a = 0;
1464 for(n=0; n<dd->nnodes; n++)
1466 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1468 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1470 copy_rvec(buf[a++],v[c]);
1477 void dd_collect_vec(gmx_domdec_t *dd,
1478 t_state *state_local,rvec *lv,rvec *v)
1480 gmx_domdec_master_t *ma;
1481 int n,i,c,a,nalloc=0;
1482 rvec *buf=NULL;
1484 dd_collect_cg(dd,state_local);
1486 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1488 dd_collect_vec_sendrecv(dd,lv,v);
1490 else
1492 dd_collect_vec_gatherv(dd,lv,v);
1497 void dd_collect_state(gmx_domdec_t *dd,
1498 t_state *state_local,t_state *state)
1500 int est,i,j,nh;
1502 nh = state->nhchainlength;
1504 if (DDMASTER(dd))
1506 for (i=0;i<efptNR;i++) {
1507 state->lambda[i] = state_local->lambda[i];
1509 state->fep_state = state_local->fep_state;
1510 state->veta = state_local->veta;
1511 state->vol0 = state_local->vol0;
1512 copy_mat(state_local->box,state->box);
1513 copy_mat(state_local->boxv,state->boxv);
1514 copy_mat(state_local->svir_prev,state->svir_prev);
1515 copy_mat(state_local->fvir_prev,state->fvir_prev);
1516 copy_mat(state_local->pres_prev,state->pres_prev);
1519 for(i=0; i<state_local->ngtc; i++)
1521 for(j=0; j<nh; j++) {
1522 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1523 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1525 state->therm_integral[i] = state_local->therm_integral[i];
1527 for(i=0; i<state_local->nnhpres; i++)
1529 for(j=0; j<nh; j++) {
1530 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1531 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1535 for(est=0; est<estNR; est++)
1537 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1539 switch (est) {
1540 case estX:
1541 dd_collect_vec(dd,state_local,state_local->x,state->x);
1542 break;
1543 case estV:
1544 dd_collect_vec(dd,state_local,state_local->v,state->v);
1545 break;
1546 case estSDX:
1547 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1548 break;
1549 case estCGP:
1550 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1551 break;
1552 case estLD_RNG:
1553 if (state->nrngi == 1)
1555 if (DDMASTER(dd))
1557 for(i=0; i<state_local->nrng; i++)
1559 state->ld_rng[i] = state_local->ld_rng[i];
1563 else
1565 dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1566 state_local->ld_rng,state->ld_rng);
1568 break;
1569 case estLD_RNGI:
1570 if (state->nrngi == 1)
1572 if (DDMASTER(dd))
1574 state->ld_rngi[0] = state_local->ld_rngi[0];
1577 else
1579 dd_gather(dd,sizeof(state->ld_rngi[0]),
1580 state_local->ld_rngi,state->ld_rngi);
1582 break;
1583 case estDISRE_INITF:
1584 case estDISRE_RM3TAV:
1585 case estORIRE_INITF:
1586 case estORIRE_DTAV:
1587 break;
1588 default:
1589 gmx_incons("Unknown state entry encountered in dd_collect_state");
1595 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1597 int est;
1599 if (debug)
1601 fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1604 state->nalloc = over_alloc_dd(nalloc);
1606 for(est=0; est<estNR; est++)
1608 if (EST_DISTR(est) && (state->flags & (1<<est)))
1610 switch(est) {
1611 case estX:
1612 srenew(state->x,state->nalloc);
1613 break;
1614 case estV:
1615 srenew(state->v,state->nalloc);
1616 break;
1617 case estSDX:
1618 srenew(state->sd_X,state->nalloc);
1619 break;
1620 case estCGP:
1621 srenew(state->cg_p,state->nalloc);
1622 break;
1623 case estLD_RNG:
1624 case estLD_RNGI:
1625 case estDISRE_INITF:
1626 case estDISRE_RM3TAV:
1627 case estORIRE_INITF:
1628 case estORIRE_DTAV:
1629 /* No reallocation required */
1630 break;
1631 default:
1632 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1637 if (f != NULL)
1639 srenew(*f,state->nalloc);
1643 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1644 int nalloc)
1646 if (nalloc > fr->cg_nalloc)
1648 if (debug)
1650 fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1652 fr->cg_nalloc = over_alloc_dd(nalloc);
1653 srenew(fr->cginfo,fr->cg_nalloc);
1654 if (fr->cutoff_scheme == ecutsGROUP)
1656 srenew(fr->cg_cm,fr->cg_nalloc);
1659 if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1661 /* We don't use charge groups, we use x in state to set up
1662 * the atom communication.
1664 dd_realloc_state(state,f,nalloc);
1668 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1669 rvec *v,rvec *lv)
1671 gmx_domdec_master_t *ma;
1672 int n,i,c,a,nalloc=0;
1673 rvec *buf=NULL;
1675 if (DDMASTER(dd))
1677 ma = dd->ma;
1679 for(n=0; n<dd->nnodes; n++)
1681 if (n != dd->rank)
1683 if (ma->nat[n] > nalloc)
1685 nalloc = over_alloc_dd(ma->nat[n]);
1686 srenew(buf,nalloc);
1688 /* Pack the coordinates for node n into the send buffer */
1689 a = 0;
1690 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1692 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1694 copy_rvec(v[c],buf[a++]);
1697 if (a != ma->nat[n])
1699 gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1700 a,ma->nat[n]);
1703 #ifdef GMX_MPI
1704 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1705 DDRANK(dd,n),n,dd->mpi_comm_all);
1706 #endif
1709 sfree(buf);
1710 n = DDMASTERRANK(dd);
1711 a = 0;
1712 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1714 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1716 copy_rvec(v[c],lv[a++]);
1720 else
1722 #ifdef GMX_MPI
1723 MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1724 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1725 #endif
1729 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1730 rvec *v,rvec *lv)
1732 gmx_domdec_master_t *ma;
1733 int *scounts=NULL,*disps=NULL;
1734 int n,i,c,a,nalloc=0;
1735 rvec *buf=NULL;
1737 if (DDMASTER(dd))
1739 ma = dd->ma;
1741 get_commbuffer_counts(dd,&scounts,&disps);
1743 buf = ma->vbuf;
1744 a = 0;
1745 for(n=0; n<dd->nnodes; n++)
1747 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1749 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1751 copy_rvec(v[c],buf[a++]);
1757 dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1760 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1762 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1764 dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1766 else
1768 dd_distribute_vec_scatterv(dd,cgs,v,lv);
1772 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1773 t_state *state,t_state *state_local,
1774 rvec **f)
1776 int i,j,nh;
1778 nh = state->nhchainlength;
1780 if (DDMASTER(dd))
1782 for(i=0;i<efptNR;i++)
1784 state_local->lambda[i] = state->lambda[i];
1786 state_local->fep_state = state->fep_state;
1787 state_local->veta = state->veta;
1788 state_local->vol0 = state->vol0;
1789 copy_mat(state->box,state_local->box);
1790 copy_mat(state->box_rel,state_local->box_rel);
1791 copy_mat(state->boxv,state_local->boxv);
1792 copy_mat(state->svir_prev,state_local->svir_prev);
1793 copy_mat(state->fvir_prev,state_local->fvir_prev);
1794 for(i=0; i<state_local->ngtc; i++)
1796 for(j=0; j<nh; j++) {
1797 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1798 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1800 state_local->therm_integral[i] = state->therm_integral[i];
1802 for(i=0; i<state_local->nnhpres; i++)
1804 for(j=0; j<nh; j++) {
1805 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1806 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1810 dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1811 dd_bcast(dd,sizeof(int),&state_local->fep_state);
1812 dd_bcast(dd,sizeof(real),&state_local->veta);
1813 dd_bcast(dd,sizeof(real),&state_local->vol0);
1814 dd_bcast(dd,sizeof(state_local->box),state_local->box);
1815 dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1816 dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1817 dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1818 dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1819 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1820 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1821 dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1822 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1823 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1825 if (dd->nat_home > state_local->nalloc)
1827 dd_realloc_state(state_local,f,dd->nat_home);
1829 for(i=0; i<estNR; i++)
1831 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1833 switch (i) {
1834 case estX:
1835 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1836 break;
1837 case estV:
1838 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1839 break;
1840 case estSDX:
1841 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1842 break;
1843 case estCGP:
1844 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1845 break;
1846 case estLD_RNG:
1847 if (state->nrngi == 1)
1849 dd_bcastc(dd,
1850 state_local->nrng*sizeof(state_local->ld_rng[0]),
1851 state->ld_rng,state_local->ld_rng);
1853 else
1855 dd_scatter(dd,
1856 state_local->nrng*sizeof(state_local->ld_rng[0]),
1857 state->ld_rng,state_local->ld_rng);
1859 break;
1860 case estLD_RNGI:
1861 if (state->nrngi == 1)
1863 dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1864 state->ld_rngi,state_local->ld_rngi);
1866 else
1868 dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1869 state->ld_rngi,state_local->ld_rngi);
1871 break;
1872 case estDISRE_INITF:
1873 case estDISRE_RM3TAV:
1874 case estORIRE_INITF:
1875 case estORIRE_DTAV:
1876 /* Not implemented yet */
1877 break;
1878 default:
1879 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1885 static char dim2char(int dim)
1887 char c='?';
1889 switch (dim)
1891 case XX: c = 'X'; break;
1892 case YY: c = 'Y'; break;
1893 case ZZ: c = 'Z'; break;
1894 default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1897 return c;
1900 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1901 gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1903 rvec grid_s[2],*grid_r=NULL,cx,r;
1904 char fname[STRLEN],format[STRLEN],buf[22];
1905 FILE *out;
1906 int a,i,d,z,y,x;
1907 matrix tric;
1908 real vol;
1910 copy_rvec(dd->comm->cell_x0,grid_s[0]);
1911 copy_rvec(dd->comm->cell_x1,grid_s[1]);
1913 if (DDMASTER(dd))
1915 snew(grid_r,2*dd->nnodes);
1918 dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1920 if (DDMASTER(dd))
1922 for(d=0; d<DIM; d++)
1924 for(i=0; i<DIM; i++)
1926 if (d == i)
1928 tric[d][i] = 1;
1930 else
1932 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1934 tric[d][i] = box[i][d]/box[i][i];
1936 else
1938 tric[d][i] = 0;
1943 sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1944 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1945 out = gmx_fio_fopen(fname,"w");
1946 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1947 a = 1;
1948 for(i=0; i<dd->nnodes; i++)
1950 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1951 for(d=0; d<DIM; d++)
1953 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1955 for(z=0; z<2; z++)
1957 for(y=0; y<2; y++)
1959 for(x=0; x<2; x++)
1961 cx[XX] = grid_r[i*2+x][XX];
1962 cx[YY] = grid_r[i*2+y][YY];
1963 cx[ZZ] = grid_r[i*2+z][ZZ];
1964 mvmul(tric,cx,r);
1965 fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1966 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1970 for(d=0; d<DIM; d++)
1972 for(x=0; x<4; x++)
1974 switch(d)
1976 case 0: y = 1 + i*8 + 2*x; break;
1977 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1978 case 2: y = 1 + i*8 + x; break;
1980 fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1984 gmx_fio_fclose(out);
1985 sfree(grid_r);
1989 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1990 gmx_mtop_t *mtop,t_commrec *cr,
1991 int natoms,rvec x[],matrix box)
1993 char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1994 FILE *out;
1995 int i,ii,resnr,c;
1996 char *atomname,*resname;
1997 real b;
1998 gmx_domdec_t *dd;
2000 dd = cr->dd;
2001 if (natoms == -1)
2003 natoms = dd->comm->nat[ddnatVSITE];
2006 sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2008 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
2009 sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
2011 out = gmx_fio_fopen(fname,"w");
2013 fprintf(out,"TITLE %s\n",title);
2014 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2015 for(i=0; i<natoms; i++)
2017 ii = dd->gatindex[i];
2018 gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2019 if (i < dd->comm->nat[ddnatZONE])
2021 c = 0;
2022 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2024 c++;
2026 b = c;
2028 else if (i < dd->comm->nat[ddnatVSITE])
2030 b = dd->comm->zones.n;
2032 else
2034 b = dd->comm->zones.n + 1;
2036 fprintf(out,strlen(atomname)<4 ? format : format4,
2037 "ATOM",(ii+1)%100000,
2038 atomname,resname,' ',resnr%10000,' ',
2039 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2041 fprintf(out,"TER\n");
2043 gmx_fio_fclose(out);
2046 real dd_cutoff_mbody(gmx_domdec_t *dd)
2048 gmx_domdec_comm_t *comm;
2049 int di;
2050 real r;
2052 comm = dd->comm;
2054 r = -1;
2055 if (comm->bInterCGBondeds)
2057 if (comm->cutoff_mbody > 0)
2059 r = comm->cutoff_mbody;
2061 else
2063 /* cutoff_mbody=0 means we do not have DLB */
2064 r = comm->cellsize_min[dd->dim[0]];
2065 for(di=1; di<dd->ndim; di++)
2067 r = min(r,comm->cellsize_min[dd->dim[di]]);
2069 if (comm->bBondComm)
2071 r = max(r,comm->cutoff_mbody);
2073 else
2075 r = min(r,comm->cutoff);
2080 return r;
2083 real dd_cutoff_twobody(gmx_domdec_t *dd)
2085 real r_mb;
2087 r_mb = dd_cutoff_mbody(dd);
2089 return max(dd->comm->cutoff,r_mb);
2093 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2095 int nc,ntot;
2097 nc = dd->nc[dd->comm->cartpmedim];
2098 ntot = dd->comm->ntot[dd->comm->cartpmedim];
2099 copy_ivec(coord,coord_pme);
2100 coord_pme[dd->comm->cartpmedim] =
2101 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2104 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2106 /* Here we assign a PME node to communicate with this DD node
2107 * by assuming that the major index of both is x.
2108 * We add cr->npmenodes/2 to obtain an even distribution.
2110 return (ddindex*npme + npme/2)/ndd;
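/* Example of the even distribution: with ndd = 6 and npme = 2,
 * (ddindex*2 + 1)/6 maps ddindex 0,1,2 to PME node 0 and 3,4,5 to 1.
 */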
2113 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2115 return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2118 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2120 return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2123 static int *dd_pmenodes(t_commrec *cr)
2125 int *pmenodes;
2126 int n,i,p0,p1;
2128 snew(pmenodes,cr->npmenodes);
2129 n = 0;
2130 for(i=0; i<cr->dd->nnodes; i++) {
2131 p0 = cr_ddindex2pmeindex(cr,i);
2132 p1 = cr_ddindex2pmeindex(cr,i+1);
2133 if (i+1 == cr->dd->nnodes || p1 > p0) {
2134 if (debug)
2135 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2136 pmenodes[n] = i + 1 + n;
2137 n++;
2141 return pmenodes;
2144 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2146 gmx_domdec_t *dd;
2147 ivec coords,coords_pme,nc;
2148 int slab;
2150 dd = cr->dd;
2151 /*
2152 if (dd->comm->bCartesian) {
2153 gmx_ddindex2xyz(dd->nc,ddindex,coords);
2154 dd_coords2pmecoords(dd,coords,coords_pme);
2155 copy_ivec(dd->ntot,nc);
2156 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2157 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2159 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2160 } else {
2161 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2162 }
2163 */
2164 coords[XX] = x;
2165 coords[YY] = y;
2166 coords[ZZ] = z;
2167 slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2169 return slab;
2172 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2174 gmx_domdec_comm_t *comm;
2175 ivec coords;
2176 int ddindex,nodeid=-1;
2178 comm = cr->dd->comm;
2180 coords[XX] = x;
2181 coords[YY] = y;
2182 coords[ZZ] = z;
2183 if (comm->bCartesianPP_PME)
2185 #ifdef GMX_MPI
2186 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2187 #endif
2189 else
2191 ddindex = dd_index(cr->dd->nc,coords);
2192 if (comm->bCartesianPP)
2194 nodeid = comm->ddindex2simnodeid[ddindex];
2196 else
2198 if (comm->pmenodes)
2200 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2202 else
2204 nodeid = ddindex;
2209 return nodeid;
2212 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2214 gmx_domdec_t *dd;
2215 gmx_domdec_comm_t *comm;
2216 ivec coord,coord_pme;
2217 int i;
2218 int pmenode=-1;
2220 dd = cr->dd;
2221 comm = dd->comm;
2223 /* This assumes a uniform x domain decomposition grid cell size */
2224 if (comm->bCartesianPP_PME)
2226 #ifdef GMX_MPI
2227 MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2228 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2230 /* This is a PP node */
2231 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2232 MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2234 #endif
2236 else if (comm->bCartesianPP)
2238 if (sim_nodeid < dd->nnodes)
2240 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2243 else
2245 /* This assumes DD cells with identical x coordinates
2246 * are numbered sequentially.
2248 if (dd->comm->pmenodes == NULL)
2250 if (sim_nodeid < dd->nnodes)
2252 /* The DD index equals the nodeid */
2253 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2256 else
2258 i = 0;
2259 while (sim_nodeid > dd->comm->pmenodes[i])
2261 i++;
2263 if (sim_nodeid < dd->comm->pmenodes[i])
2265 pmenode = dd->comm->pmenodes[i];
2270 return pmenode;
2273 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2275 gmx_bool bPMEOnlyNode;
2277 if (DOMAINDECOMP(cr))
2279 bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2281 else
2283 bPMEOnlyNode = FALSE;
2286 return bPMEOnlyNode;
2289 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2290 int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2292 gmx_domdec_t *dd;
2293 int x,y,z;
2294 ivec coord,coord_pme;
2296 dd = cr->dd;
2298 snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2300 *nmy_ddnodes = 0;
2301 for(x=0; x<dd->nc[XX]; x++)
2303 for(y=0; y<dd->nc[YY]; y++)
2305 for(z=0; z<dd->nc[ZZ]; z++)
2307 if (dd->comm->bCartesianPP_PME)
2309 coord[XX] = x;
2310 coord[YY] = y;
2311 coord[ZZ] = z;
2312 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2313 if (dd->ci[XX] == coord_pme[XX] &&
2314 dd->ci[YY] == coord_pme[YY] &&
2315 dd->ci[ZZ] == coord_pme[ZZ])
2316 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2318 else
2320 /* The slab corresponds to the nodeid in the PME group */
2321 if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2323 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2330 /* The last PP-only node is the peer node */
2331 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2333 if (debug)
2335 fprintf(debug,"Receive coordinates from PP nodes:");
2336 for(x=0; x<*nmy_ddnodes; x++)
2338 fprintf(debug," %d",(*my_ddnodes)[x]);
2340 fprintf(debug,"\n");
2344 static gmx_bool receive_vir_ener(t_commrec *cr)
2346 gmx_domdec_comm_t *comm;
2347 int pmenode,coords[DIM],rank;
2348 gmx_bool bReceive;
2350 bReceive = TRUE;
2351 if (cr->npmenodes < cr->dd->nnodes)
2353 comm = cr->dd->comm;
2354 if (comm->bCartesianPP_PME)
2356 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2357 #ifdef GMX_MPI
2358 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2359 coords[comm->cartpmedim]++;
2360 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2362 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2363 if (dd_simnode2pmenode(cr,rank) == pmenode)
2365 /* This is not the last PP node for pmenode */
2366 bReceive = FALSE;
2369 #endif
2371 else
2373 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2374 if (cr->sim_nodeid+1 < cr->nnodes &&
2375 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2377 /* This is not the last PP node for pmenode */
2378 bReceive = FALSE;
2383 return bReceive;
2386 static void set_zones_ncg_home(gmx_domdec_t *dd)
2388 gmx_domdec_zones_t *zones;
2389 int i;
2391 zones = &dd->comm->zones;
2393 zones->cg_range[0] = 0;
2394 for(i=1; i<zones->n+1; i++)
2396 zones->cg_range[i] = dd->ncg_home;
2400 static void rebuild_cgindex(gmx_domdec_t *dd,
2401 const int *gcgs_index,t_state *state)
2403 int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2405 ind = state->cg_gl;
2406 dd_cg_gl = dd->index_gl;
2407 cgindex = dd->cgindex;
2408 nat = 0;
2409 cgindex[0] = nat;
2410 for(i=0; i<state->ncg_gl; i++)
2412 cgindex[i] = nat;
2413 cg_gl = ind[i];
2414 dd_cg_gl[i] = cg_gl;
2415 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2417 cgindex[i] = nat;
2419 dd->ncg_home = state->ncg_gl;
2420 dd->nat_home = nat;
2422 set_zones_ncg_home(dd);
2425 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2427 while (cg >= cginfo_mb->cg_end)
2429 cginfo_mb++;
2432 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
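/* Example lookup, assuming a block of identical molecules with cg_mod
 * charge groups each: with cg_start = 10 and cg_mod = 3, global cg 17
 * maps to entry (17 - 10) % 3 = 1 of that block's cginfo.
 */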
2435 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2436 t_forcerec *fr,char *bLocalCG)
2438 cginfo_mb_t *cginfo_mb;
2439 int *cginfo;
2440 int cg;
2442 if (fr != NULL)
2444 cginfo_mb = fr->cginfo_mb;
2445 cginfo = fr->cginfo;
2447 for(cg=cg0; cg<cg1; cg++)
2449 cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2453 if (bLocalCG != NULL)
2455 for(cg=cg0; cg<cg1; cg++)
2457 bLocalCG[index_gl[cg]] = TRUE;
2462 static void make_dd_indices(gmx_domdec_t *dd,
2463 const int *gcgs_index,int cg_start)
2465 int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2466 int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2467 gmx_ga2la_t *ga2la;
2468 char *bLocalCG;
2469 gmx_bool bCGs;
2471 bLocalCG = dd->comm->bLocalCG;
2473 if (dd->nat_tot > dd->gatindex_nalloc)
2475 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2476 srenew(dd->gatindex,dd->gatindex_nalloc);
2479 nzone = dd->comm->zones.n;
2480 zone2cg = dd->comm->zones.cg_range;
2481 zone_ncg1 = dd->comm->zone_ncg1;
2482 index_gl = dd->index_gl;
2483 gatindex = dd->gatindex;
2484 bCGs = dd->comm->bCGs;
2486 if (zone2cg[1] != dd->ncg_home)
2488 gmx_incons("dd->ncg_zone is not up to date");
2491 /* Make the local to global and global to local atom index */
2492 a = dd->cgindex[cg_start];
2493 for(zone=0; zone<nzone; zone++)
2495 if (zone == 0)
2497 cg0 = cg_start;
2499 else
2501 cg0 = zone2cg[zone];
2503 cg1 = zone2cg[zone+1];
2504 cg1_p1 = cg0 + zone_ncg1[zone];
2506 for(cg=cg0; cg<cg1; cg++)
2508 zone1 = zone;
2509 if (cg >= cg1_p1)
2511 /* Signal that this cg is more than one pulse away */
2512 zone1 += nzone;
2514 cg_gl = index_gl[cg];
2515 if (bCGs)
2517 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2519 gatindex[a] = a_gl;
2520 ga2la_set(dd->ga2la,a_gl,a,zone1);
2521 a++;
2524 else
2526 gatindex[a] = cg_gl;
2527 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2528 a++;
2534 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2535 const char *where)
2537 int ncg,i,ngl,nerr;
2539 nerr = 0;
2540 if (bLocalCG == NULL)
2542 return nerr;
2544 for(i=0; i<dd->ncg_tot; i++)
2546 if (!bLocalCG[dd->index_gl[i]])
2548 fprintf(stderr,
2549 "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2550 nerr++;
2553 ngl = 0;
2554 for(i=0; i<ncg_sys; i++)
2556 if (bLocalCG[i])
2558 ngl++;
2561 if (ngl != dd->ncg_tot)
2563 fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2564 nerr++;
2567 return nerr;
2570 static void check_index_consistency(gmx_domdec_t *dd,
2571 int natoms_sys,int ncg_sys,
2572 const char *where)
2574 int nerr,ngl,i,a,cell;
2575 int *have;
2577 nerr = 0;
2579 if (dd->comm->DD_debug > 1)
2581 snew(have,natoms_sys);
2582 for(a=0; a<dd->nat_tot; a++)
2584 if (have[dd->gatindex[a]] > 0)
2586 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2588 else
2590 have[dd->gatindex[a]] = a + 1;
2593 sfree(have);
2596 snew(have,dd->nat_tot);
2598 ngl = 0;
2599 for(i=0; i<natoms_sys; i++)
2601 if (ga2la_get(dd->ga2la,i,&a,&cell))
2603 if (a >= dd->nat_tot)
2605 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2606 nerr++;
2608 else
2610 have[a] = 1;
2611 if (dd->gatindex[a] != i)
2613 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2614 nerr++;
2617 ngl++;
2620 if (ngl != dd->nat_tot)
2622 fprintf(stderr,
2623 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2624 dd->rank,where,ngl,dd->nat_tot);
2626 for(a=0; a<dd->nat_tot; a++)
2628 if (have[a] == 0)
2630 fprintf(stderr,
2631 "DD node %d, %s: local atom %d, global %d has no global index\n",
2632 dd->rank,where,a+1,dd->gatindex[a]+1);
2635 sfree(have);
2637 nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2639 if (nerr > 0) {
2640 gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2641 dd->rank,where,nerr);
2645 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2647 int i;
2648 char *bLocalCG;
2650 if (a_start == 0)
2652 /* Clear the whole list without searching */
2653 ga2la_clear(dd->ga2la);
2655 else
2657 for(i=a_start; i<dd->nat_tot; i++)
2659 ga2la_del(dd->ga2la,dd->gatindex[i]);
2663 bLocalCG = dd->comm->bLocalCG;
2664 if (bLocalCG)
2666 for(i=cg_start; i<dd->ncg_tot; i++)
2668 bLocalCG[dd->index_gl[i]] = FALSE;
2672 dd_clear_local_vsite_indices(dd);
2674 if (dd->constraints)
2676 dd_clear_local_constraint_indices(dd);
2680 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2681 int dim_ind)
2683 real grid_jump_limit;
2685 /* The distance between the boundaries of cells at distance
2686 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2687 * and by the fact that cells should not be shifted by more than
2688 * half their size, such that cg's only shift by one cell
2689 * at redecomposition.
2691 grid_jump_limit = comm->cellsize_limit;
2692 if (!comm->bVacDLBNoLimit)
2694 grid_jump_limit = max(grid_jump_limit,
2695 cutoff/comm->cd[dim_ind].np);
2698 return grid_jump_limit;
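/* Numeric example: with a cut-off of 1.2 nm communicated in np = 2
 * pulses, the jump limit is max(cellsize_limit, 1.2/2) >= 0.6 nm,
 * so staggered cell boundaries can never come closer to each other
 * than the cut-off distance covered by a single pulse.
 */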
2701 static gmx_bool check_grid_jump(gmx_large_int_t step,
2702 gmx_domdec_t *dd,
2703 real cutoff,
2704 gmx_ddbox_t *ddbox,
2705 gmx_bool bFatal)
2707 gmx_domdec_comm_t *comm;
2708 int d,dim;
2709 real limit,bfac;
2710 gmx_bool bInvalid;
2712 bInvalid = FALSE;
2714 comm = dd->comm;
2716 for(d=1; d<dd->ndim; d++)
2718 dim = dd->dim[d];
2719 limit = grid_jump_limit(comm,cutoff,d);
2720 bfac = ddbox->box_size[dim];
2721 if (ddbox->tric_dir[dim])
2723 bfac *= ddbox->skew_fac[dim];
2725 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2726 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2728 bInvalid = TRUE;
2730 if (bFatal)
2732 char buf[22];
2734 /* This error should never be triggered under normal
2735 * circumstances, but you never know ...
2737 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2738 gmx_step_str(step,buf),
2739 dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2744 return bInvalid;
2747 static int dd_load_count(gmx_domdec_comm_t *comm)
2749 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2752 static float dd_force_load(gmx_domdec_comm_t *comm)
2754 float load;
2756 if (comm->eFlop)
2758 load = comm->flop;
2759 if (comm->eFlop > 1)
2761 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2764 else
2766 load = comm->cycl[ddCyclF];
2767 if (comm->cycl_n[ddCyclF] > 1)
2769 /* Subtract the maximum of the last n cycle counts
2770 * to get rid of possible high counts due to other sources,
2771 * for instance system activity, that would otherwise
2772 * affect the dynamic load balancing.
2774 load -= comm->cycl_max[ddCyclF];
2778 return load;
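/* Note on the two branches above: with flop counting (eFlop > 1) the
 * load is deliberately perturbed by up to +/-5% per extra eFlop level,
 *
 *   load = flop*(1.0 + (eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05));
 *
 * which is useful for testing DLB convergence. With cycle counting,
 * subtracting cycl_max drops the single highest sample, so one step
 * slowed down by unrelated system activity does not skew the balance.
 */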
2781 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2783 gmx_domdec_comm_t *comm;
2784 int i;
2786 comm = dd->comm;
2788 snew(*dim_f,dd->nc[dim]+1);
2789 (*dim_f)[0] = 0;
2790 for(i=1; i<dd->nc[dim]; i++)
2792 if (comm->slb_frac[dim])
2794 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2796 else
2798 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2801 (*dim_f)[dd->nc[dim]] = 1;
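/* Worked example: with slb_frac[dim] = { 0.25, 0.5, 0.25 } for three
 * cells, the cumulative boundaries become dim_f = { 0, 0.25, 0.75, 1 };
 * without slb_frac the grid is uniform, dim_f = { 0, 1/3, 2/3, 1 }.
 */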
2804 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2806 int pmeindex,slab,nso,i;
2807 ivec xyz;
2809 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2811 ddpme->dim = YY;
2813 else
2815 ddpme->dim = dimind;
2817 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2819 ddpme->nslab = (ddpme->dim == 0 ?
2820 dd->comm->npmenodes_x :
2821 dd->comm->npmenodes_y);
2823 if (ddpme->nslab <= 1)
2825 return;
2828 nso = dd->comm->npmenodes/ddpme->nslab;
2829 /* Determine for each PME slab the PP location range for dimension dim */
2830 snew(ddpme->pp_min,ddpme->nslab);
2831 snew(ddpme->pp_max,ddpme->nslab);
2832 for(slab=0; slab<ddpme->nslab; slab++) {
2833 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2834 ddpme->pp_max[slab] = 0;
2836 for(i=0; i<dd->nnodes; i++) {
2837 ddindex2xyz(dd->nc,i,xyz);
2838 /* For y only use our y/z slab.
2839 * This assumes that the PME x grid size matches the DD grid size.
2841 if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2842 pmeindex = ddindex2pmeindex(dd,i);
2843 if (dimind == 0) {
2844 slab = pmeindex/nso;
2845 } else {
2846 slab = pmeindex % ddpme->nslab;
2848 ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2849 ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2853 set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2856 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2858 if (dd->comm->ddpme[0].dim == XX)
2860 return dd->comm->ddpme[0].maxshift;
2862 else
2864 return 0;
2868 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2870 if (dd->comm->ddpme[0].dim == YY)
2872 return dd->comm->ddpme[0].maxshift;
2874 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2876 return dd->comm->ddpme[1].maxshift;
2878 else
2880 return 0;
2884 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2885 gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2887 gmx_domdec_comm_t *comm;
2888 int nc,ns,s;
2889 int *xmin,*xmax;
2890 real range,pme_boundary;
2891 int sh;
2893 comm = dd->comm;
2894 nc = dd->nc[ddpme->dim];
2895 ns = ddpme->nslab;
2897 if (!ddpme->dim_match)
2899 /* PP decomposition is not along dim: the worst situation */
2900 sh = ns/2;
2902 else if (ns <= 3 || (bUniform && ns == nc))
2904 /* The optimal situation */
2905 sh = 1;
2907 else
2909 /* For all pme nodes we need to check which pp nodes
2910 * they could possibly need to communicate with.
2912 xmin = ddpme->pp_min;
2913 xmax = ddpme->pp_max;
2914 /* Allow for atoms to be maximally 2/3 times the cut-off
2915 * out of their DD cell. This is a reasonable balance
2916 * between performance and support for most charge-group/cut-off
2917 * combinations.
2919 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2920 /* Avoid extra communication when we are exactly at a boundary */
2921 range *= 0.999;
2923 sh = 1;
2924 for(s=0; s<ns; s++)
2926 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2927 pme_boundary = (real)s/ns;
2928 while (sh+1 < ns &&
2929 ((s-(sh+1) >= 0 &&
2930 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2931 (s-(sh+1) < 0 &&
2932 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2934 sh++;
2936 pme_boundary = (real)(s+1)/ns;
2937 while (sh+1 < ns &&
2938 ((s+(sh+1) < ns &&
2939 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2940 (s+(sh+1) >= ns &&
2941 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2943 sh++;
2948 ddpme->maxshift = sh;
2950 if (debug)
2952 fprintf(debug,"PME slab communication range for dim %d is %d\n",
2953 ddpme->dim,ddpme->maxshift);
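/* Interpretation of maxshift (a reading of the code above, not a new
 * guarantee): a PME slab s may need coordinates from PP cells covering
 * slabs s-maxshift up to s+maxshift, so a smaller maxshift means less
 * PP-PME communication. sh = 1 is the optimum, reached for ns <= 3 or
 * a uniform grid with ns == nc; a decomposition that is not along this
 * dimension forces the worst case sh = ns/2.
 */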
2957 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2959 int d,dim;
2961 for(d=0; d<dd->ndim; d++)
2963 dim = dd->dim[d];
2964 if (dim < ddbox->nboundeddim &&
2965 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2966 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2968 gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2969 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2970 dd->nc[dim],dd->comm->cellsize_limit);
2975 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2976 gmx_bool bMaster,ivec npulse)
2978 gmx_domdec_comm_t *comm;
2979 int d,j;
2980 rvec cellsize_min;
2981 real *cell_x,cell_dx,cellsize;
2983 comm = dd->comm;
2985 for(d=0; d<DIM; d++)
2987 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2988 npulse[d] = 1;
2989 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2991 /* Uniform grid */
2992 cell_dx = ddbox->box_size[d]/dd->nc[d];
2993 if (bMaster)
2995 for(j=0; j<dd->nc[d]+1; j++)
2997 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3000 else
3002 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
3003 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3005 cellsize = cell_dx*ddbox->skew_fac[d];
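/* npulse[d] is the smallest number of communication pulses n
 * with n*cellsize >= cutoff (capped at nc[d]-1); e.g. a 1.2 nm
 * cut-off with 0.7 nm cells requires npulse = 2.
 */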
3006 while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3008 npulse[d]++;
3010 cellsize_min[d] = cellsize;
3012 else
3014 /* Statically load balanced grid */
3015 /* Even when we are not doing a master distribution, we determine
3016 * all cell borders in a loop to obtain values identical to
3017 * the master distribution case and to determine npulse.
3019 if (bMaster)
3021 cell_x = dd->ma->cell_x[d];
3023 else
3025 snew(cell_x,dd->nc[d]+1);
3027 cell_x[0] = ddbox->box0[d];
3028 for(j=0; j<dd->nc[d]; j++)
3030 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3031 cell_x[j+1] = cell_x[j] + cell_dx;
3032 cellsize = cell_dx*ddbox->skew_fac[d];
3033 while (cellsize*npulse[d] < comm->cutoff &&
3034 npulse[d] < dd->nc[d]-1)
3036 npulse[d]++;
3038 cellsize_min[d] = min(cellsize_min[d],cellsize);
3040 if (!bMaster)
3042 comm->cell_x0[d] = cell_x[dd->ci[d]];
3043 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3044 sfree(cell_x);
3047 /* The following limitation avoids a cell receiving some of its own
3048 * home charge groups back over the periodic boundary.
3049 * Duplicated charge groups cause trouble with the global indices.
3051 if (d < ddbox->npbcdim &&
3052 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3054 gmx_fatal_collective(FARGS,NULL,dd,
3055 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3056 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3057 comm->cutoff,
3058 dd->nc[d],dd->nc[d],
3059 dd->nnodes > dd->nc[d] ? "cells" : "processors");
3063 if (!comm->bDynLoadBal)
3065 copy_rvec(cellsize_min,comm->cellsize_min);
3068 for(d=0; d<comm->npmedecompdim; d++)
3070 set_pme_maxshift(dd,&comm->ddpme[d],
3071 comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3072 comm->ddpme[d].slb_dim_f);
3077 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3078 int d,int dim,gmx_domdec_root_t *root,
3079 gmx_ddbox_t *ddbox,
3080 gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3082 gmx_domdec_comm_t *comm;
3083 int ncd,i,j,nmin,nmin_old;
3084 gmx_bool bLimLo,bLimHi;
3085 real *cell_size;
3086 real fac,halfway,cellsize_limit_f_i,region_size;
3087 gmx_bool bPBC,bLastHi=FALSE;
3088 int nrange[]={range[0],range[1]};
3090 region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
3092 comm = dd->comm;
3094 ncd = dd->nc[dim];
3096 bPBC = (dim < ddbox->npbcdim);
3098 cell_size = root->buf_ncd;
3100 if (debug)
3102 fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3105 /* First we need to check if the scaling does not make cells
3106 * smaller than the smallest allowed size.
3107 * We need to do this iteratively, since if a cell is too small,
3108 * it needs to be enlarged, which makes all the other cells smaller,
3109 * which could in turn make another cell smaller than allowed.
3111 for(i=range[0]; i<range[1]; i++)
3113 root->bCellMin[i] = FALSE;
3115 nmin = 0;
3118 nmin_old = nmin;
3119 /* We need the total for normalization */
3120 fac = 0;
3121 for(i=range[0]; i<range[1]; i++)
3123 if (root->bCellMin[i] == FALSE)
3125 fac += cell_size[i];
3128 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3129 /* Determine the cell boundaries */
3130 for(i=range[0]; i<range[1]; i++)
3132 if (root->bCellMin[i] == FALSE)
3134 cell_size[i] *= fac;
3135 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3137 cellsize_limit_f_i = 0;
3139 else
3141 cellsize_limit_f_i = cellsize_limit_f;
3143 if (cell_size[i] < cellsize_limit_f_i)
3145 root->bCellMin[i] = TRUE;
3146 cell_size[i] = cellsize_limit_f_i;
3147 nmin++;
3150 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3153 while (nmin > nmin_old);
3155 i=range[1]-1;
3156 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3157 /* For this check we should not use DD_CELL_MARGIN,
3158 * but a slightly smaller factor,
3159 * since rounding could get us below the limit.
3161 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3163 char buf[22];
3164 gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3165 gmx_step_str(step,buf),
3166 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3167 ncd,comm->cellsize_min[dim]);
3170 root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3172 if (!bUniform)
3174 /* Check if the boundary did not displace more than halfway
3175 * each of the cells it bounds, as this could cause problems,
3176 * especially when the differences between cell sizes are large.
3177 * If changes are applied, they will not make cells smaller
3178 * than the cut-off, as we check all the boundaries which
3179 * might be affected by a change and if the old state was ok,
3180 * the cells will at most be shrunk back to their old size.
3182 for(i=range[0]+1; i<range[1]; i++)
3184 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3185 if (root->cell_f[i] < halfway)
3187 root->cell_f[i] = halfway;
3188 /* Check if the change also causes shifts of the next boundaries */
3189 for(j=i+1; j<range[1]; j++)
3191 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3192 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3195 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3196 if (root->cell_f[i] > halfway)
3198 root->cell_f[i] = halfway;
3199 /* Check if the change also causes shifts of the next boundaries */
3200 for(j=i-1; j>=range[0]+1; j--)
3202 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3203 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3209 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3210 /* Find the highest violation of LimLo (a) and the first violation of LimHi that follows it (thus the lowest following) (b),
3211 * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta). oldb and nexta can be the range boundaries;
3212 * for a and b, nrange is used. */
3213 if (d > 0)
3215 /* Take care of the staggering of the cell boundaries */
3216 if (bUniform)
3218 for(i=range[0]; i<range[1]; i++)
3220 root->cell_f_max0[i] = root->cell_f[i];
3221 root->cell_f_min1[i] = root->cell_f[i+1];
3224 else
3226 for(i=range[0]+1; i<range[1]; i++)
3228 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3229 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3230 if (bLimLo && bLimHi)
3232 /* Both limits violated, try the best we can */
3233 /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3234 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3235 nrange[0]=range[0];
3236 nrange[1]=i;
3237 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3239 nrange[0]=i;
3240 nrange[1]=range[1];
3241 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3243 return;
3245 else if (bLimLo)
3247 /* root->cell_f[i] = root->bound_min[i]; */
3248 nrange[1]=i; /* only store the violation location; there could be a following LimLo violation with a higher index */
3249 bLastHi=FALSE;
3251 else if (bLimHi && !bLastHi)
3253 bLastHi=TRUE;
3254 if (nrange[1] < range[1]) /* found a LimLo before */
3256 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3257 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3258 nrange[0]=nrange[1];
3260 root->cell_f[i] = root->bound_max[i];
3261 nrange[1]=i;
3262 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3263 nrange[0]=i;
3264 nrange[1]=range[1];
3267 if (nrange[1] < range[1]) /* the last violation found was a LimLo */
3269 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3270 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3271 nrange[0]=nrange[1];
3272 nrange[1]=range[1];
3273 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3275 else if (nrange[0] > range[0]) /* found at least one LimHi */
3277 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3284 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3285 int d,int dim,gmx_domdec_root_t *root,
3286 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3287 gmx_bool bUniform,gmx_large_int_t step)
3289 gmx_domdec_comm_t *comm;
3290 int ncd,d1,i,j,pos;
3291 real *cell_size;
3292 real load_aver,load_i,imbalance,change,change_max,sc;
3293 real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3294 real change_limit;
3295 real relax = 0.5;
3296 gmx_bool bPBC;
3297 int range[] = { 0, 0 };
3299 comm = dd->comm;
3301 /* Convert the maximum change from the input percentage to a fraction */
3302 change_limit = comm->dlb_scale_lim*0.01;
3304 ncd = dd->nc[dim];
3306 bPBC = (dim < ddbox->npbcdim);
3308 cell_size = root->buf_ncd;
3310 /* Store the original boundaries */
3311 for(i=0; i<ncd+1; i++)
3313 root->old_cell_f[i] = root->cell_f[i];
3315 if (bUniform) {
3316 for(i=0; i<ncd; i++)
3318 cell_size[i] = 1.0/ncd;
3321 else if (dd_load_count(comm))
3323 load_aver = comm->load[d].sum_m/ncd;
3324 change_max = 0;
3325 for(i=0; i<ncd; i++)
3327 /* Determine the relative imbalance of cell i */
3328 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3329 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3330 /* Determine the change of the cell size using underrelaxation */
3331 change = -relax*imbalance;
3332 change_max = max(change_max,max(change,-change));
3334 /* Limit the amount of scaling.
3335 * We need to use the same rescaling for all cells in one row,
3336 * otherwise the load balancing might not converge.
3338 sc = relax;
3339 if (change_max > change_limit)
3341 sc *= change_limit/change_max;
3343 for(i=0; i<ncd; i++)
3345 /* Determine the relative imbalance of cell i */
3346 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3347 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3348 /* Determine the change of the cell size using underrelaxation */
3349 change = -sc*imbalance;
3350 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3354 cellsize_limit_f = comm->cellsize_min[dim]/ddbox->box_size[dim];
3355 cellsize_limit_f *= DD_CELL_MARGIN;
3356 dist_min_f_hard = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3357 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3358 if (ddbox->tric_dir[dim])
3360 cellsize_limit_f /= ddbox->skew_fac[dim];
3361 dist_min_f /= ddbox->skew_fac[dim];
3363 if (bDynamicBox && d > 0)
3365 dist_min_f *= DD_PRES_SCALE_MARGIN;
3367 if (d > 0 && !bUniform)
3369 /* Make sure that the grid is not shifted too much */
3370 for(i=1; i<ncd; i++) {
3371 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3373 gmx_incons("Inconsistent DD boundary staggering limits!");
3375 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3376 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3377 if (space > 0) {
3378 root->bound_min[i] += 0.5*space;
3380 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3381 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3382 if (space < 0) {
3383 root->bound_max[i] += 0.5*space;
3385 if (debug)
3387 fprintf(debug,
3388 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3389 d,i,
3390 root->cell_f_max0[i-1] + dist_min_f,
3391 root->bound_min[i],root->cell_f[i],root->bound_max[i],
3392 root->cell_f_min1[i] - dist_min_f);
3396 range[1]=ncd;
3397 root->cell_f[0] = 0;
3398 root->cell_f[ncd] = 1;
3399 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3402 /* After the checks above, the cells should obey the cut-off
3403 * restrictions, but it does not hurt to check.
3405 for(i=0; i<ncd; i++)
3407 if (debug)
3409 fprintf(debug,"Relative bounds dim %d cell %d: %f %f\n",
3410 dim,i,root->cell_f[i],root->cell_f[i+1]);
3413 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3414 root->cell_f[i+1] - root->cell_f[i] <
3415 cellsize_limit_f/DD_CELL_MARGIN)
3417 char buf[22];
3418 fprintf(stderr,
3419 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3420 gmx_step_str(step,buf),dim2char(dim),i,
3421 (root->cell_f[i+1] - root->cell_f[i])
3422 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3426 pos = ncd + 1;
3427 /* Store the cell boundaries of the lower dimensions at the end */
3428 for(d1=0; d1<d; d1++)
3430 root->cell_f[pos++] = comm->cell_f0[d1];
3431 root->cell_f[pos++] = comm->cell_f1[d1];
3434 if (d < comm->npmedecompdim)
3436 /* The master determines the maximum shift for
3437 * the coordinate communication between separate PME nodes.
3439 set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3441 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3442 if (d >= 1)
3444 root->cell_f[pos++] = comm->ddpme[1].maxshift;
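/* Worked example of the underrelaxation above: with relax = 0.5 and a
 * cell measuring 20% more load than the row average (imbalance = 0.2),
 * change = -0.1 and the cell shrinks by 10%:
 *
 *   cell_size[i] = (cell_f[i+1] - cell_f[i])*(1 - 0.1);
 *
 * When the largest |change| in the row exceeds the dlb_scale_lim
 * percentage, all changes are scaled down by the same factor sc,
 * since mixing different rescalings in one row can keep the load
 * balancing from converging.
 */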
3448 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3449 gmx_ddbox_t *ddbox,int dimind)
3451 gmx_domdec_comm_t *comm;
3452 int dim;
3454 comm = dd->comm;
3456 /* Set the cell dimensions */
3457 dim = dd->dim[dimind];
3458 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3459 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3460 if (dim >= ddbox->nboundeddim)
3462 comm->cell_x0[dim] += ddbox->box0[dim];
3463 comm->cell_x1[dim] += ddbox->box0[dim];
3467 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3468 int d,int dim,real *cell_f_row,
3469 gmx_ddbox_t *ddbox)
3471 gmx_domdec_comm_t *comm;
3472 int d1,dim1,pos;
3474 comm = dd->comm;
3476 #ifdef GMX_MPI
3477 /* Each node would only need to know two fractions,
3478 * but it is probably cheaper to broadcast the whole array.
3480 MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3481 0,comm->mpi_comm_load[d]);
3482 #endif
3483 /* Copy the fractions for this dimension from the buffer */
3484 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3485 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3486 /* The whole array was communicated, so set the buffer position */
3487 pos = dd->nc[dim] + 1;
3488 for(d1=0; d1<=d; d1++)
3490 if (d1 < d)
3492 /* Copy the cell fractions of the lower dimensions */
3493 comm->cell_f0[d1] = cell_f_row[pos++];
3494 comm->cell_f1[d1] = cell_f_row[pos++];
3496 relative_to_absolute_cell_bounds(dd,ddbox,d1);
3498 /* Convert the communicated shift from float to int */
3499 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3500 if (d >= 1)
3502 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
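/* Layout of the broadcast cell_f_row buffer read above:
 *   [0 .. nc[dim]]    cell boundary fractions for this dimension
 *   [nc[dim]+1 ..]    cell_f0/cell_f1 pairs for each lower dimension
 *   [.. pos++]        ddpme[0].maxshift, stored as a real
 *   [.. pos++]        ddpme[1].maxshift, only present when d >= 1
 * set_dd_cell_sizes_dlb_root fills the buffer in exactly this order.
 */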
3506 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3507 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3508 gmx_bool bUniform,gmx_large_int_t step)
3510 gmx_domdec_comm_t *comm;
3511 int d,dim,d1;
3512 gmx_bool bRowMember,bRowRoot;
3513 real *cell_f_row;
3515 comm = dd->comm;
3517 for(d=0; d<dd->ndim; d++)
3519 dim = dd->dim[d];
3520 bRowMember = TRUE;
3521 bRowRoot = TRUE;
3522 for(d1=d; d1<dd->ndim; d1++)
3524 if (dd->ci[dd->dim[d1]] > 0)
3526 if (d1 > d)
3528 bRowMember = FALSE;
3530 bRowRoot = FALSE;
3533 if (bRowMember)
3535 if (bRowRoot)
3537 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3538 ddbox,bDynamicBox,bUniform,step);
3539 cell_f_row = comm->root[d]->cell_f;
3541 else
3543 cell_f_row = comm->cell_f_row;
3545 distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3550 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3552 int d;
3554 /* This function assumes the box is static and should therefore
3555 * not be called when the box has changed since the last
3556 * call to dd_partition_system.
3558 for(d=0; d<dd->ndim; d++)
3560 relative_to_absolute_cell_bounds(dd,ddbox,d);
3566 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3567 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3568 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3569 gmx_wallcycle_t wcycle)
3571 gmx_domdec_comm_t *comm;
3572 int dim;
3574 comm = dd->comm;
3576 if (bDoDLB)
3578 wallcycle_start(wcycle,ewcDDCOMMBOUND);
3579 set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3580 wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3582 else if (bDynamicBox)
3584 set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3587 /* Set the dimensions for which no DD is used */
3588 for(dim=0; dim<DIM; dim++) {
3589 if (dd->nc[dim] == 1) {
3590 comm->cell_x0[dim] = 0;
3591 comm->cell_x1[dim] = ddbox->box_size[dim];
3592 if (dim >= ddbox->nboundeddim)
3594 comm->cell_x0[dim] += ddbox->box0[dim];
3595 comm->cell_x1[dim] += ddbox->box0[dim];
3601 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3603 int d,np,i;
3604 gmx_domdec_comm_dim_t *cd;
3606 for(d=0; d<dd->ndim; d++)
3608 cd = &dd->comm->cd[d];
3609 np = npulse[dd->dim[d]];
3610 if (np > cd->np_nalloc)
3612 if (debug)
3614 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3615 dim2char(dd->dim[d]),np);
3617 if (DDMASTER(dd) && cd->np_nalloc > 0)
3619 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3621 srenew(cd->ind,np);
3622 for(i=cd->np_nalloc; i<np; i++)
3624 cd->ind[i].index = NULL;
3625 cd->ind[i].nalloc = 0;
3627 cd->np_nalloc = np;
3629 cd->np = np;
3634 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3635 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3636 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3637 gmx_wallcycle_t wcycle)
3639 gmx_domdec_comm_t *comm;
3640 int d;
3641 ivec npulse;
3643 comm = dd->comm;
3645 /* Copy the old cell boundaries for the cg displacement check */
3646 copy_rvec(comm->cell_x0,comm->old_cell_x0);
3647 copy_rvec(comm->cell_x1,comm->old_cell_x1);
3649 if (comm->bDynLoadBal)
3651 if (DDMASTER(dd))
3653 check_box_size(dd,ddbox);
3655 set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3657 else
3659 set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3660 realloc_comm_ind(dd,npulse);
3663 if (debug)
3665 for(d=0; d<DIM; d++)
3667 fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3668 d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3673 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3674 gmx_ddbox_t *ddbox,
3675 rvec cell_ns_x0,rvec cell_ns_x1,
3676 gmx_large_int_t step)
3678 gmx_domdec_comm_t *comm;
3679 int dim_ind,dim;
3681 comm = dd->comm;
3683 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3685 dim = dd->dim[dim_ind];
3687 /* Without PBC we don't have restrictions on the outer cells */
3688 if (!(dim >= ddbox->npbcdim &&
3689 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3690 comm->bDynLoadBal &&
3691 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3692 comm->cellsize_min[dim])
3694 char buf[22];
3695 gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3696 gmx_step_str(step,buf),dim2char(dim),
3697 comm->cell_x1[dim] - comm->cell_x0[dim],
3698 ddbox->skew_fac[dim],
3699 dd->comm->cellsize_min[dim],
3700 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3704 if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3706 /* Communicate the boundaries and update cell_ns_x0/1 */
3707 dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3708 if (dd->bGridJump && dd->ndim > 1)
3710 check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3715 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3717 if (YY < npbcdim)
3719 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3721 else
3723 tcm[YY][XX] = 0;
3725 if (ZZ < npbcdim)
3727 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3728 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3730 else
3732 tcm[ZZ][XX] = 0;
3733 tcm[ZZ][YY] = 0;
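/* Usage sketch: tcm removes the triclinic tilt contributed by the
 * higher dimensions, so callers can obtain a box-relative lattice
 * coordinate without a full matrix inversion:
 *
 *   pos_d = x[d];
 *   for(j=d+1; j<DIM; j++)
 *   {
 *       pos_d += x[j]*tcm[j][d];
 *   }
 */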
3737 static void check_screw_box(matrix box)
3739 /* Mathematical limitation */
3740 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3742 gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3745 /* Limitation due to the asymmetry of the eighth shell method */
3746 if (box[ZZ][YY] != 0)
3748 gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3752 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3753 matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3754 gmx_domdec_t *dd)
3756 gmx_domdec_master_t *ma;
3757 int **tmp_ind=NULL,*tmp_nalloc=NULL;
3758 int i,icg,j,k,k0,k1,d,npbcdim;
3759 matrix tcm;
3760 rvec box_size,cg_cm;
3761 ivec ind;
3762 real nrcg,inv_ncg,pos_d;
3763 atom_id *cgindex;
3764 gmx_bool bUnbounded,bScrew;
3766 ma = dd->ma;
3768 if (tmp_ind == NULL)
3770 snew(tmp_nalloc,dd->nnodes);
3771 snew(tmp_ind,dd->nnodes);
3772 for(i=0; i<dd->nnodes; i++)
3774 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3775 snew(tmp_ind[i],tmp_nalloc[i]);
3779 /* Clear the count */
3780 for(i=0; i<dd->nnodes; i++)
3782 ma->ncg[i] = 0;
3783 ma->nat[i] = 0;
3786 make_tric_corr_matrix(dd->npbcdim,box,tcm);
3788 cgindex = cgs->index;
3790 /* Compute the center of geometry for all charge groups */
3791 for(icg=0; icg<cgs->nr; icg++)
3793 k0 = cgindex[icg];
3794 k1 = cgindex[icg+1];
3795 nrcg = k1 - k0;
3796 if (nrcg == 1)
3798 copy_rvec(pos[k0],cg_cm);
3800 else
3802 inv_ncg = 1.0/nrcg;
3804 clear_rvec(cg_cm);
3805 for(k=k0; (k<k1); k++)
3807 rvec_inc(cg_cm,pos[k]);
3809 for(d=0; (d<DIM); d++)
3811 cg_cm[d] *= inv_ncg;
3814 /* Put the charge group in the box and determine the cell index */
3815 for(d=DIM-1; d>=0; d--) {
3816 pos_d = cg_cm[d];
3817 if (d < dd->npbcdim)
3819 bScrew = (dd->bScrewPBC && d == XX);
3820 if (tric_dir[d] && dd->nc[d] > 1)
3822 /* Use triclinic coordinates for this dimension */
3823 for(j=d+1; j<DIM; j++)
3825 pos_d += cg_cm[j]*tcm[j][d];
3828 while(pos_d >= box[d][d])
3830 pos_d -= box[d][d];
3831 rvec_dec(cg_cm,box[d]);
3832 if (bScrew)
3834 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3835 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3837 for(k=k0; (k<k1); k++)
3839 rvec_dec(pos[k],box[d]);
3840 if (bScrew)
3842 pos[k][YY] = box[YY][YY] - pos[k][YY];
3843 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3847 while(pos_d < 0)
3849 pos_d += box[d][d];
3850 rvec_inc(cg_cm,box[d]);
3851 if (bScrew)
3853 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3854 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3856 for(k=k0; (k<k1); k++)
3858 rvec_inc(pos[k],box[d]);
3859 if (bScrew) {
3860 pos[k][YY] = box[YY][YY] - pos[k][YY];
3861 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3866 /* This could be done more efficiently */
3867 ind[d] = 0;
3868 while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3870 ind[d]++;
3873 i = dd_index(dd->nc,ind);
3874 if (ma->ncg[i] == tmp_nalloc[i])
3876 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3877 srenew(tmp_ind[i],tmp_nalloc[i]);
3879 tmp_ind[i][ma->ncg[i]] = icg;
3880 ma->ncg[i]++;
3881 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3884 k1 = 0;
3885 for(i=0; i<dd->nnodes; i++)
3887 ma->index[i] = k1;
3888 for(k=0; k<ma->ncg[i]; k++)
3890 ma->cg[k1++] = tmp_ind[i][k];
3893 ma->index[dd->nnodes] = k1;
3895 for(i=0; i<dd->nnodes; i++)
3897 sfree(tmp_ind[i]);
3899 sfree(tmp_ind);
3900 sfree(tmp_nalloc);
3902 if (fplog)
3904 char buf[22];
3905 fprintf(fplog,"Charge group distribution at step %s:",
3906 gmx_step_str(step,buf));
3907 for(i=0; i<dd->nnodes; i++)
3909 fprintf(fplog," %d",ma->ncg[i]);
3911 fprintf(fplog,"\n");
3915 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3916 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3917 rvec pos[])
3919 gmx_domdec_master_t *ma=NULL;
3920 ivec npulse;
3921 int i,cg_gl;
3922 int *ibuf,buf2[2] = { 0, 0 };
3923 gmx_bool bMaster = DDMASTER(dd);
3924 if (bMaster)
3926 ma = dd->ma;
3928 if (dd->bScrewPBC)
3930 check_screw_box(box);
3933 set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3935 distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3936 for(i=0; i<dd->nnodes; i++)
3938 ma->ibuf[2*i] = ma->ncg[i];
3939 ma->ibuf[2*i+1] = ma->nat[i];
3941 ibuf = ma->ibuf;
3943 else
3945 ibuf = NULL;
3947 dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3949 dd->ncg_home = buf2[0];
3950 dd->nat_home = buf2[1];
3951 dd->ncg_tot = dd->ncg_home;
3952 dd->nat_tot = dd->nat_home;
3953 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3955 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3956 srenew(dd->index_gl,dd->cg_nalloc);
3957 srenew(dd->cgindex,dd->cg_nalloc+1);
3959 if (bMaster)
3961 for(i=0; i<dd->nnodes; i++)
3963 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3964 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3968 dd_scatterv(dd,
3969 DDMASTER(dd) ? ma->ibuf : NULL,
3970 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3971 DDMASTER(dd) ? ma->cg : NULL,
3972 dd->ncg_home*sizeof(int),dd->index_gl);
3974 /* Determine the home charge group sizes */
3975 dd->cgindex[0] = 0;
3976 for(i=0; i<dd->ncg_home; i++)
3978 cg_gl = dd->index_gl[i];
3979 dd->cgindex[i+1] =
3980 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3983 if (debug)
3985 fprintf(debug,"Home charge groups:\n");
3986 for(i=0; i<dd->ncg_home; i++)
3988 fprintf(debug," %d",dd->index_gl[i]);
3989 if (i % 10 == 9)
3990 fprintf(debug,"\n");
3992 fprintf(debug,"\n");
3996 static int compact_and_copy_vec_at(int ncg,int *move,
3997 int *cgindex,
3998 int nvec,int vec,
3999 rvec *src,gmx_domdec_comm_t *comm,
4000 gmx_bool bCompact)
4002 int m,icg,i,i0,i1,nrcg;
4003 int home_pos;
4004 int pos_vec[DIM*2];
4006 home_pos = 0;
4008 for(m=0; m<DIM*2; m++)
4010 pos_vec[m] = 0;
4013 i0 = 0;
4014 for(icg=0; icg<ncg; icg++)
4016 i1 = cgindex[icg+1];
4017 m = move[icg];
4018 if (m == -1)
4020 if (bCompact)
4022 /* Compact the home array in place */
4023 for(i=i0; i<i1; i++)
4025 copy_rvec(src[i],src[home_pos++]);
4029 else
4031 /* Copy to the communication buffer */
4032 nrcg = i1 - i0;
4033 pos_vec[m] += 1 + vec*nrcg;
4034 for(i=i0; i<i1; i++)
4036 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4038 pos_vec[m] += (nvec - vec - 1)*nrcg;
4040 if (!bCompact)
4042 home_pos += i1 - i0;
4044 i0 = i1;
4047 return home_pos;
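/* Send-buffer layout assumed by the pos_vec arithmetic in
 * compact_and_copy_vec_at/_cg: per moved cg, cgcm_state[m] holds one
 * rvec with the cg center followed by nvec state vectors of nrcg
 * atoms each,
 *
 *   [ cg_cm | x[0..nrcg) | v[0..nrcg) | ... ]
 *
 * so vector "vec" of a cg starts at offset 1 + vec*nrcg and the
 * remaining vectors are skipped with (nvec - vec - 1)*nrcg.
 */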
4050 static int compact_and_copy_vec_cg(int ncg,int *move,
4051 int *cgindex,
4052 int nvec,rvec *src,gmx_domdec_comm_t *comm,
4053 gmx_bool bCompact)
4055 int m,icg,i0,i1,nrcg;
4056 int home_pos;
4057 int pos_vec[DIM*2];
4059 home_pos = 0;
4061 for(m=0; m<DIM*2; m++)
4063 pos_vec[m] = 0;
4066 i0 = 0;
4067 for(icg=0; icg<ncg; icg++)
4069 i1 = cgindex[icg+1];
4070 m = move[icg];
4071 if (m == -1)
4073 if (bCompact)
4075 /* Compact the home array in place */
4076 copy_rvec(src[icg],src[home_pos++]);
4079 else
4081 nrcg = i1 - i0;
4082 /* Copy to the communication buffer */
4083 copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4084 pos_vec[m] += 1 + nrcg*nvec;
4086 i0 = i1;
4088 if (!bCompact)
4090 home_pos = ncg;
4093 return home_pos;
4096 static int compact_ind(int ncg,int *move,
4097 int *index_gl,int *cgindex,
4098 int *gatindex,
4099 gmx_ga2la_t ga2la,char *bLocalCG,
4100 int *cginfo)
4102 int cg,nat,a0,a1,a,a_gl;
4103 int home_pos;
4105 home_pos = 0;
4106 nat = 0;
4107 for(cg=0; cg<ncg; cg++)
4109 a0 = cgindex[cg];
4110 a1 = cgindex[cg+1];
4111 if (move[cg] == -1)
4113 /* Compact the home arrays in place.
4114 * Anything that can be done here avoids access to global arrays.
4116 cgindex[home_pos] = nat;
4117 for(a=a0; a<a1; a++)
4119 a_gl = gatindex[a];
4120 gatindex[nat] = a_gl;
4121 /* The cell number stays 0, so we don't need to set it */
4122 ga2la_change_la(ga2la,a_gl,nat);
4123 nat++;
4125 index_gl[home_pos] = index_gl[cg];
4126 cginfo[home_pos] = cginfo[cg];
4127 /* The charge group remains local, so bLocalCG does not change */
4128 home_pos++;
4130 else
4132 /* Clear the global indices */
4133 for(a=a0; a<a1; a++)
4135 ga2la_del(ga2la,gatindex[a]);
4137 if (bLocalCG)
4139 bLocalCG[index_gl[cg]] = FALSE;
4143 cgindex[home_pos] = nat;
4145 return home_pos;
4148 static void clear_and_mark_ind(int ncg,int *move,
4149 int *index_gl,int *cgindex,int *gatindex,
4150 gmx_ga2la_t ga2la,char *bLocalCG,
4151 int *cell_index)
4153 int cg,a0,a1,a;
4155 for(cg=0; cg<ncg; cg++)
4157 if (move[cg] >= 0)
4159 a0 = cgindex[cg];
4160 a1 = cgindex[cg+1];
4161 /* Clear the global indices */
4162 for(a=a0; a<a1; a++)
4164 ga2la_del(ga2la,gatindex[a]);
4166 if (bLocalCG)
4168 bLocalCG[index_gl[cg]] = FALSE;
4170 /* Signal that this cg has moved using the ns cell index.
4171 * Here we set it to -1. fill_grid will change it
4172 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4174 cell_index[cg] = -1;
4179 static void print_cg_move(FILE *fplog,
4180 gmx_domdec_t *dd,
4181 gmx_large_int_t step,int cg,int dim,int dir,
4182 gmx_bool bHaveLimitdAndCMOld,real limitd,
4183 rvec cm_old,rvec cm_new,real pos_d)
4185 gmx_domdec_comm_t *comm;
4186 char buf[22];
4188 comm = dd->comm;
4190 fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4191 if (bHaveLimitdAndCMOld)
4193 fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4194 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4196 else
4198 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4199 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4201 fprintf(fplog,"distance out of cell %f\n",
4202 dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4203 if (bHaveLimitdAndCMOld)
4205 fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4206 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4208 fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4209 cm_new[XX],cm_new[YY],cm_new[ZZ]);
4210 fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4211 dim2char(dim),
4212 comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4213 fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4214 dim2char(dim),
4215 comm->cell_x0[dim],comm->cell_x1[dim]);
4218 static void cg_move_error(FILE *fplog,
4219 gmx_domdec_t *dd,
4220 gmx_large_int_t step,int cg,int dim,int dir,
4221 gmx_bool bHaveLimitdAndCMOld,real limitd,
4222 rvec cm_old,rvec cm_new,real pos_d)
4224 if (fplog)
4226 print_cg_move(fplog, dd,step,cg,dim,dir,
4227 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4229 print_cg_move(stderr,dd,step,cg,dim,dir,
4230 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4231 gmx_fatal(FARGS,
4232 "A charge group moved too far between two domain decomposition steps\n"
4233 "This usually means that your system is not well equilibrated");
4236 static void rotate_state_atom(t_state *state,int a)
4238 int est;
4240 for(est=0; est<estNR; est++)
4242 if (EST_DISTR(est) && (state->flags & (1<<est))) {
4243 switch (est) {
4244 case estX:
4245 /* Rotate the complete state; for a rectangular box only */
4246 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4247 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4248 break;
4249 case estV:
4250 state->v[a][YY] = -state->v[a][YY];
4251 state->v[a][ZZ] = -state->v[a][ZZ];
4252 break;
4253 case estSDX:
4254 state->sd_X[a][YY] = -state->sd_X[a][YY];
4255 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4256 break;
4257 case estCGP:
4258 state->cg_p[a][YY] = -state->cg_p[a][YY];
4259 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4260 break;
4261 case estDISRE_INITF:
4262 case estDISRE_RM3TAV:
4263 case estORIRE_INITF:
4264 case estORIRE_DTAV:
4265 /* These are distances, so not affected by rotation */
4266 break;
4267 default:
4268 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4274 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4276 if (natoms > comm->moved_nalloc)
4278 /* Contents should be preserved here */
4279 comm->moved_nalloc = over_alloc_dd(natoms);
4280 srenew(comm->moved,comm->moved_nalloc);
4283 return comm->moved;
4286 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4287 gmx_domdec_t *dd,
4288 t_state *state,
4289 ivec tric_dir,matrix tcm,
4290 rvec cell_x0,rvec cell_x1,
4291 rvec limitd,rvec limit0,rvec limit1,
4292 const int *cgindex,
4293 int cg_start,int cg_end,
4294 rvec *cg_cm,
4295 int *move)
4297 int npbcdim;
4298 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4299 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4300 int flag;
4301 gmx_bool bScrew;
4302 ivec dev;
4303 real inv_ncg,pos_d;
4304 rvec cm_new;
4306 npbcdim = dd->npbcdim;
4308 for(cg=cg_start; cg<cg_end; cg++)
4310 k0 = cgindex[cg];
4311 k1 = cgindex[cg+1];
4312 nrcg = k1 - k0;
4313 if (nrcg == 1)
4315 copy_rvec(state->x[k0],cm_new);
4317 else
4319 inv_ncg = 1.0/nrcg;
4321 clear_rvec(cm_new);
4322 for(k=k0; (k<k1); k++)
4324 rvec_inc(cm_new,state->x[k]);
4326 for(d=0; (d<DIM); d++)
4328 cm_new[d] = inv_ncg*cm_new[d];
4332 clear_ivec(dev);
4333 /* Do pbc and check DD cell boundary crossings */
4334 for(d=DIM-1; d>=0; d--)
4336 if (dd->nc[d] > 1)
4338 bScrew = (dd->bScrewPBC && d == XX);
4339 /* Determine the location of this cg in lattice coordinates */
4340 pos_d = cm_new[d];
4341 if (tric_dir[d])
4343 for(d2=d+1; d2<DIM; d2++)
4345 pos_d += cm_new[d2]*tcm[d2][d];
4348 /* Put the charge group in the triclinic unit-cell */
4349 if (pos_d >= cell_x1[d])
4351 if (pos_d >= limit1[d])
4353 cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4354 cg_cm[cg],cm_new,pos_d);
4356 dev[d] = 1;
4357 if (dd->ci[d] == dd->nc[d] - 1)
4359 rvec_dec(cm_new,state->box[d]);
4360 if (bScrew)
4362 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4363 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4365 for(k=k0; (k<k1); k++)
4367 rvec_dec(state->x[k],state->box[d]);
4368 if (bScrew)
4370 rotate_state_atom(state,k);
4375 else if (pos_d < cell_x0[d])
4377 if (pos_d < limit0[d])
4379 cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4380 cg_cm[cg],cm_new,pos_d);
4382 dev[d] = -1;
4383 if (dd->ci[d] == 0)
4385 rvec_inc(cm_new,state->box[d]);
4386 if (bScrew)
4388 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4389 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4391 for(k=k0; (k<k1); k++)
4393 rvec_inc(state->x[k],state->box[d]);
4394 if (bScrew)
4396 rotate_state_atom(state,k);
4402 else if (d < npbcdim)
4404 /* Put the charge group in the rectangular unit-cell */
4405 while (cm_new[d] >= state->box[d][d])
4407 rvec_dec(cm_new,state->box[d]);
4408 for(k=k0; (k<k1); k++)
4410 rvec_dec(state->x[k],state->box[d]);
4413 while (cm_new[d] < 0)
4415 rvec_inc(cm_new,state->box[d]);
4416 for(k=k0; (k<k1); k++)
4418 rvec_inc(state->x[k],state->box[d]);
4424 copy_rvec(cm_new,cg_cm[cg]);
4426 /* Determine where this cg should go */
4427 flag = 0;
4428 mc = -1;
4429 for(d=0; d<dd->ndim; d++)
4431 dim = dd->dim[d];
4432 if (dev[dim] == 1)
4434 flag |= DD_FLAG_FW(d);
4435 if (mc == -1)
4437 mc = d*2;
4440 else if (dev[dim] == -1)
4442 flag |= DD_FLAG_BW(d);
4443 if (mc == -1) {
4444 if (dd->nc[dim] > 2)
4446 mc = d*2 + 1;
4448 else
4450 mc = d*2;
4455 /* Temporarily store the flag in move */
4456 move[cg] = mc + flag;
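/* Encoding of move[cg]: the target-cell index mc occupies the low
 * bits and the DD_FLAG_FW/DD_FLAG_BW direction flags the high bits;
 * dd_redistribute_cg decodes it again with
 *
 *   flag = move[cg] & ~DD_FLAG_NRCG;
 *   mc   = move[cg] &  DD_FLAG_NRCG;
 */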
4460 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4461 gmx_domdec_t *dd,ivec tric_dir,
4462 t_state *state,rvec **f,
4463 t_forcerec *fr,t_mdatoms *md,
4464 gmx_bool bCompact,
4465 t_nrnb *nrnb,
4466 int *ncg_stay_home,
4467 int *ncg_moved)
4469 int *move;
4470 int npbcdim;
4471 int ncg[DIM*2],nat[DIM*2];
4472 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4473 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4474 int sbuf[2],rbuf[2];
4475 int home_pos_cg,home_pos_at,buf_pos;
4476 int flag;
4477 gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4478 gmx_bool bScrew;
4479 ivec dev;
4480 real inv_ncg,pos_d;
4481 matrix tcm;
4482 rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4483 atom_id *cgindex;
4484 cginfo_mb_t *cginfo_mb;
4485 gmx_domdec_comm_t *comm;
4486 int *moved;
4487 int nthread,thread;
4489 if (dd->bScrewPBC)
4491 check_screw_box(state->box);
4494 comm = dd->comm;
4495 if (fr->cutoff_scheme == ecutsGROUP)
4497 cg_cm = fr->cg_cm;
4500 for(i=0; i<estNR; i++)
4502 if (EST_DISTR(i))
4504 switch (i)
4506 case estX: /* Always present */ break;
4507 case estV: bV = (state->flags & (1<<i)); break;
4508 case estSDX: bSDX = (state->flags & (1<<i)); break;
4509 case estCGP: bCGP = (state->flags & (1<<i)); break;
4510 case estLD_RNG:
4511 case estLD_RNGI:
4512 case estDISRE_INITF:
4513 case estDISRE_RM3TAV:
4514 case estORIRE_INITF:
4515 case estORIRE_DTAV:
4516 /* No processing required */
4517 break;
4518 default:
4519 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4524 if (dd->ncg_tot > comm->nalloc_int)
4526 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4527 srenew(comm->buf_int,comm->nalloc_int);
4529 move = comm->buf_int;
4531 /* Clear the count */
4532 for(c=0; c<dd->ndim*2; c++)
4534 ncg[c] = 0;
4535 nat[c] = 0;
4538 npbcdim = dd->npbcdim;
4540 for(d=0; (d<DIM); d++)
4542 limitd[d] = dd->comm->cellsize_min[d];
4543 if (d >= npbcdim && dd->ci[d] == 0)
4545 cell_x0[d] = -GMX_FLOAT_MAX;
4547 else
4549 cell_x0[d] = comm->cell_x0[d];
4551 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4553 cell_x1[d] = GMX_FLOAT_MAX;
4555 else
4557 cell_x1[d] = comm->cell_x1[d];
4559 if (d < npbcdim)
4561 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4562 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4564 else
4566 /* We check after communication if a charge group moved
4567 * more than one cell. Set the pre-comm check limit to float_max.
4569 limit0[d] = -GMX_FLOAT_MAX;
4570 limit1[d] = GMX_FLOAT_MAX;
4574 make_tric_corr_matrix(npbcdim,state->box,tcm);
4576 cgindex = dd->cgindex;
4578 nthread = gmx_omp_nthreads_get(emntDomdec);
4580 /* Compute the center of geometry for all home charge groups
4581 * and put them in the box and determine where they should go.
4583 #pragma omp parallel for num_threads(nthread) schedule(static)
4584 for(thread=0; thread<nthread; thread++)
4586 calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4587 cell_x0,cell_x1,limitd,limit0,limit1,
4588 cgindex,
4589 ( thread *dd->ncg_home)/nthread,
4590 ((thread+1)*dd->ncg_home)/nthread,
4591 fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4592 move);
4595 for(cg=0; cg<dd->ncg_home; cg++)
4597 if (move[cg] >= 0)
4599 mc = move[cg];
4600 flag = mc & ~DD_FLAG_NRCG;
4601 mc = mc & DD_FLAG_NRCG;
4602 move[cg] = mc;
4604 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4606 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4607 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4609 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4610 /* We store the cg size in the lower 16 bits
4611 * and the place where the charge group should go
4612 * in the next 6 bits. This saves some communication volume.
4614 nrcg = cgindex[cg+1] - cgindex[cg];
4615 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4616 ncg[mc] += 1;
4617 nat[mc] += nrcg;
4621 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4622 inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4624 *ncg_moved = 0;
4625 for(i=0; i<dd->ndim*2; i++)
4627 *ncg_moved += ncg[i];
4630 nvec = 1;
4631 if (bV)
4633 nvec++;
4635 if (bSDX)
4637 nvec++;
4639 if (bCGP)
4641 nvec++;
4644 /* Make sure the communication buffers are large enough */
4645 for(mc=0; mc<dd->ndim*2; mc++)
4647 nvr = ncg[mc] + nat[mc]*nvec;
4648 if (nvr > comm->cgcm_state_nalloc[mc])
4650 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4651 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4655 switch (fr->cutoff_scheme)
4657 case ecutsGROUP:
4658 /* Recalculating cg_cm might be cheaper than communicating,
4659 * but that could give rise to rounding issues.
4661 home_pos_cg =
4662 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4663 nvec,cg_cm,comm,bCompact);
4664 break;
4665 case ecutsVERLET:
4666 /* Without charge groups we send the moved atom coordinates
4667 * over twice. This is so the code below can be used without
4668 * many conditionals for both with and without charge groups.
4670 home_pos_cg =
4671 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4672 nvec,state->x,comm,FALSE);
4673 if (bCompact)
4675 home_pos_cg -= *ncg_moved;
4677 break;
4678 default:
4679 gmx_incons("unimplemented");
4680 home_pos_cg = 0;
4683 vec = 0;
4684 home_pos_at =
4685 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4686 nvec,vec++,state->x,comm,bCompact);
4687 if (bV)
4689 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4690 nvec,vec++,state->v,comm,bCompact);
4692 if (bSDX)
4694 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4695 nvec,vec++,state->sd_X,comm,bCompact);
4697 if (bCGP)
4699 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4700 nvec,vec++,state->cg_p,comm,bCompact);
4703 if (bCompact)
4705 compact_ind(dd->ncg_home,move,
4706 dd->index_gl,dd->cgindex,dd->gatindex,
4707 dd->ga2la,comm->bLocalCG,
4708 fr->cginfo);
4710 else
4712 if (fr->cutoff_scheme == ecutsVERLET)
4714 moved = get_moved(comm,dd->ncg_home);
4716 for(k=0; k<dd->ncg_home; k++)
4718 moved[k] = 0;
4721 else
4723 moved = fr->ns.grid->cell_index;
4726 clear_and_mark_ind(dd->ncg_home,move,
4727 dd->index_gl,dd->cgindex,dd->gatindex,
4728 dd->ga2la,comm->bLocalCG,
4729 moved);
4732 cginfo_mb = fr->cginfo_mb;
4734 *ncg_stay_home = home_pos_cg;
4735 for(d=0; d<dd->ndim; d++)
4737 dim = dd->dim[d];
4738 ncg_recv = 0;
4739 nat_recv = 0;
4740 nvr = 0;
4741 for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4743 cdd = d*2 + dir;
4744 /* Communicate the cg and atom counts */
4745 sbuf[0] = ncg[cdd];
4746 sbuf[1] = nat[cdd];
4747 if (debug)
4749 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4750 d,dir,sbuf[0],sbuf[1]);
4752 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4754 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4756 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4757 srenew(comm->buf_int,comm->nalloc_int);
4760 /* Communicate the charge group indices, sizes and flags */
4761 dd_sendrecv_int(dd, d, dir,
4762 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4763 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4765 nvs = ncg[cdd] + nat[cdd]*nvec;
4766 i = rbuf[0] + rbuf[1] *nvec;
4767 vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4769 /* Communicate cgcm and state */
4770 dd_sendrecv_rvec(dd, d, dir,
4771 comm->cgcm_state[cdd], nvs,
4772 comm->vbuf.v+nvr, i);
4773 ncg_recv += rbuf[0];
4774 nat_recv += rbuf[1];
4775 nvr += i;
4778 /* Process the received charge groups */
4779 buf_pos = 0;
4780 for(cg=0; cg<ncg_recv; cg++)
4782 flag = comm->buf_int[cg*DD_CGIBS+1];
4784 if (dim >= npbcdim && dd->nc[dim] > 2)
4786 /* No pbc in this dim and more than one domain boundary.
4787 * We check separately that a charge group did not move too far.
4789 if (((flag & DD_FLAG_FW(d)) &&
4790 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4791 ((flag & DD_FLAG_BW(d)) &&
4792 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4794 cg_move_error(fplog,dd,step,cg,dim,
4795 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4796 FALSE,0,
4797 comm->vbuf.v[buf_pos],
4798 comm->vbuf.v[buf_pos],
4799 comm->vbuf.v[buf_pos][dim]);
4803 mc = -1;
4804 if (d < dd->ndim-1)
4806 /* Check which direction this cg should go */
4807 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4809 if (dd->bGridJump)
4811 /* The cell boundaries for dimension d2 are not equal
4812 * for each cell row of the lower dimension(s),
4813 * therefore we might need to redetermine where
4814 * this cg should go.
4816 dim2 = dd->dim[d2];
4817 /* If this cg crosses the box boundary in dimension d2
4818 * we can use the communicated flag, so we do not
4819 * have to worry about pbc.
4821 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4822 (flag & DD_FLAG_FW(d2))) ||
4823 (dd->ci[dim2] == 0 &&
4824 (flag & DD_FLAG_BW(d2)))))
4826 /* Clear the two flags for this dimension */
4827 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4828 /* Determine the location of this cg
4829 * in lattice coordinates
4831 pos_d = comm->vbuf.v[buf_pos][dim2];
4832 if (tric_dir[dim2])
4834 for(d3=dim2+1; d3<DIM; d3++)
4836 pos_d +=
4837 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4840 /* Check if we are not at the box edge.
4841 * pbc is only handled in the first step above,
4842 * but this check could move the cg over pbc while
4843 * the first step did not, due to different rounding.
4845 if (pos_d >= cell_x1[dim2] &&
4846 dd->ci[dim2] != dd->nc[dim2]-1)
4848 flag |= DD_FLAG_FW(d2);
4850 else if (pos_d < cell_x0[dim2] &&
4851 dd->ci[dim2] != 0)
4853 flag |= DD_FLAG_BW(d2);
4855 comm->buf_int[cg*DD_CGIBS+1] = flag;
4858 /* Set to which neighboring cell this cg should go */
4859 if (flag & DD_FLAG_FW(d2))
4861 mc = d2*2;
4863 else if (flag & DD_FLAG_BW(d2))
4865 if (dd->nc[dd->dim[d2]] > 2)
4867 mc = d2*2+1;
4869 else
4871 mc = d2*2;
4877 nrcg = flag & DD_FLAG_NRCG;
4878 if (mc == -1)
4880 if (home_pos_cg+1 > dd->cg_nalloc)
4882 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4883 srenew(dd->index_gl,dd->cg_nalloc);
4884 srenew(dd->cgindex,dd->cg_nalloc+1);
4886 /* Set the global charge group index and size */
4887 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4888 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4889 /* Copy the state from the buffer */
4890 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4891 if (fr->cutoff_scheme == ecutsGROUP)
4893 cg_cm = fr->cg_cm;
4894 copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4896 buf_pos++;
4898 /* Set the cginfo */
4899 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4900 dd->index_gl[home_pos_cg]);
4901 if (comm->bLocalCG)
4903 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4906 if (home_pos_at+nrcg > state->nalloc)
4908 dd_realloc_state(state,f,home_pos_at+nrcg);
4910 for(i=0; i<nrcg; i++)
4912 copy_rvec(comm->vbuf.v[buf_pos++],
4913 state->x[home_pos_at+i]);
4915 if (bV)
4917 for(i=0; i<nrcg; i++)
4919 copy_rvec(comm->vbuf.v[buf_pos++],
4920 state->v[home_pos_at+i]);
4923 if (bSDX)
4925 for(i=0; i<nrcg; i++)
4927 copy_rvec(comm->vbuf.v[buf_pos++],
4928 state->sd_X[home_pos_at+i]);
4931 if (bCGP)
4933 for(i=0; i<nrcg; i++)
4935 copy_rvec(comm->vbuf.v[buf_pos++],
4936 state->cg_p[home_pos_at+i]);
4939 home_pos_cg += 1;
4940 home_pos_at += nrcg;
4942 else
4944 /* Reallocate the buffers if necessary */
4945 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4947 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4948 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4950 nvr = ncg[mc] + nat[mc]*nvec;
4951 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4953 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4954 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4956 /* Copy from the receive to the send buffers */
4957 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4958 comm->buf_int + cg*DD_CGIBS,
4959 DD_CGIBS*sizeof(int));
4960 memcpy(comm->cgcm_state[mc][nvr],
4961 comm->vbuf.v[buf_pos],
4962 (1+nrcg*nvec)*sizeof(rvec));
4963 buf_pos += 1 + nrcg*nvec;
4964 ncg[mc] += 1;
4965 nat[mc] += nrcg;
4970 /* With sorting (!bCompact) the indices are now only partially up to date
4971 * and ncg_home and nat_home are not the real count, since there are
4972 * "holes" in the arrays for the charge groups that moved to neighbors.
4974 if (fr->cutoff_scheme == ecutsVERLET)
4976 moved = get_moved(comm,home_pos_cg);
4978 for(i=dd->ncg_home; i<home_pos_cg; i++)
4980 moved[i] = 0;
4983 dd->ncg_home = home_pos_cg;
4984 dd->nat_home = home_pos_at;
4986 if (debug)
4988 fprintf(debug,
4989 "Finished repartitioning: cgs moved out %d, new home %d\n",
4990 *ncg_moved,dd->ncg_home-*ncg_moved);
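/* Add one cycle count measurement to the counters used for
 * load balancing: the sum, the number of samples and the maximum.
 */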
4995 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4997 dd->comm->cycl[ddCycl] += cycles;
4998 dd->comm->cycl_n[ddCycl]++;
4999 if (cycles > dd->comm->cycl_max[ddCycl])
5001 dd->comm->cycl_max[ddCycl] = cycles;
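/* Sum the flop counters into a single force-cost estimate.
 * To get closer to the measured timings, the non-bonded kernel counts
 * are weighted by 0.50, or by 0.25 for the W3/W4 water kernels;
 * e.g. a water kernel with n[i]*cost_nrnb(i) = 1000 contributes 250.
 * Bonded interactions are counted at full cost.
 */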
5005 static double force_flop_count(t_nrnb *nrnb)
5007 int i;
5008 double sum;
5009 const char *name;
5011 sum = 0;
5012 for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5014 /* To get closer to the real timings, we halve the count
5015 * for the normal loops and halve it again for the water loops.
5017 name = nrnb_str(i);
5018 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5020 sum += nrnb->n[i]*0.25*cost_nrnb(i);
5022 else
5024 sum += nrnb->n[i]*0.50*cost_nrnb(i);
5027 for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5029 name = nrnb_str(i);
5030 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5031 sum += nrnb->n[i]*cost_nrnb(i);
5033 for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5035 sum += nrnb->n[i]*cost_nrnb(i);
5038 return sum;
5041 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5043 if (dd->comm->eFlop)
5045 dd->comm->flop -= force_flop_count(nrnb);
5048 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5050 if (dd->comm->eFlop)
5052 dd->comm->flop += force_flop_count(nrnb);
5053 dd->comm->flop_n++;
5057 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5059 int i;
5061 for(i=0; i<ddCyclNr; i++)
5063 dd->comm->cycl[i] = 0;
5064 dd->comm->cycl_n[i] = 0;
5065 dd->comm->cycl_max[i] = 0;
5067 dd->comm->flop = 0;
5068 dd->comm->flop_n = 0;
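/* Collect the measured load of all DD cells on the master.
 * The loads are gathered hierarchically: for each DD dimension,
 * starting with the last one, the root of each cell row gathers the
 * values of its row, reduces them and passes the result on in the
 * next dimension, so the DD master ends up with the sum, maximum
 * and limitation flags for the whole grid.
 */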
5071 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5073 gmx_domdec_comm_t *comm;
5074 gmx_domdec_load_t *load;
5075 gmx_domdec_root_t *root=NULL;
5076 int d,dim,cid,i,pos;
5077 float cell_frac=0,sbuf[DD_NLOAD_MAX];
5078 gmx_bool bSepPME;
5080 if (debug)
5082 fprintf(debug,"get_load_distribution start\n");
5085 wallcycle_start(wcycle,ewcDDCOMMLOAD);
5087 comm = dd->comm;
5089 bSepPME = (dd->pme_nodeid >= 0);
5091 for(d=dd->ndim-1; d>=0; d--)
5093 dim = dd->dim[d];
5094 /* Check if we participate in the communication in this dimension */
5095 if (d == dd->ndim-1 ||
5096 (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5098 load = &comm->load[d];
5099 if (dd->bGridJump)
5101 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5103 pos = 0;
5104 if (d == dd->ndim-1)
5106 sbuf[pos++] = dd_force_load(comm);
5107 sbuf[pos++] = sbuf[0];
5108 if (dd->bGridJump)
5110 sbuf[pos++] = sbuf[0];
5111 sbuf[pos++] = cell_frac;
5112 if (d > 0)
5114 sbuf[pos++] = comm->cell_f_max0[d];
5115 sbuf[pos++] = comm->cell_f_min1[d];
5118 if (bSepPME)
5120 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5121 sbuf[pos++] = comm->cycl[ddCyclPME];
5124 else
5126 sbuf[pos++] = comm->load[d+1].sum;
5127 sbuf[pos++] = comm->load[d+1].max;
5128 if (dd->bGridJump)
5130 sbuf[pos++] = comm->load[d+1].sum_m;
5131 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5132 sbuf[pos++] = comm->load[d+1].flags;
5133 if (d > 0)
5135 sbuf[pos++] = comm->cell_f_max0[d];
5136 sbuf[pos++] = comm->cell_f_min1[d];
5139 if (bSepPME)
5141 sbuf[pos++] = comm->load[d+1].mdf;
5142 sbuf[pos++] = comm->load[d+1].pme;
5145 load->nload = pos;
5146 /* Communicate a row in DD direction d.
5147 * The communicators are set up such that the root always has rank 0.
5149 #ifdef GMX_MPI
5150 MPI_Gather(sbuf ,load->nload*sizeof(float),MPI_BYTE,
5151 load->load,load->nload*sizeof(float),MPI_BYTE,
5152 0,comm->mpi_comm_load[d]);
5153 #endif
5154 if (dd->ci[dim] == dd->master_ci[dim])
5156 /* We are the root, process this row */
5157 if (comm->bDynLoadBal)
5159 root = comm->root[d];
5161 load->sum = 0;
5162 load->max = 0;
5163 load->sum_m = 0;
5164 load->cvol_min = 1;
5165 load->flags = 0;
5166 load->mdf = 0;
5167 load->pme = 0;
5168 pos = 0;
5169 for(i=0; i<dd->nc[dim]; i++)
5171 load->sum += load->load[pos++];
5172 load->max = max(load->max,load->load[pos]);
5173 pos++;
5174 if (dd->bGridJump)
5176 if (root->bLimited)
5178 /* This direction could not be load balanced properly,
5179 * therefore we need to use the maximum instead of the average load.
5181 load->sum_m = max(load->sum_m,load->load[pos]);
5183 else
5185 load->sum_m += load->load[pos];
5187 pos++;
5188 load->cvol_min = min(load->cvol_min,load->load[pos]);
5189 pos++;
5190 if (d < dd->ndim-1)
5192 load->flags = (int)(load->load[pos++] + 0.5);
5194 if (d > 0)
5196 root->cell_f_max0[i] = load->load[pos++];
5197 root->cell_f_min1[i] = load->load[pos++];
5200 if (bSepPME)
5202 load->mdf = max(load->mdf,load->load[pos]);
5203 pos++;
5204 load->pme = max(load->pme,load->load[pos]);
5205 pos++;
5208 if (comm->bDynLoadBal && root->bLimited)
5210 load->sum_m *= dd->nc[dim];
5211 load->flags |= (1<<d);
5217 if (DDMASTER(dd))
5219 comm->nload += dd_load_count(comm);
5220 comm->load_step += comm->cycl[ddCyclStep];
5221 comm->load_sum += comm->load[0].sum;
5222 comm->load_max += comm->load[0].max;
5223 if (comm->bDynLoadBal)
5225 for(d=0; d<dd->ndim; d++)
5227 if (comm->load[0].flags & (1<<d))
5229 comm->load_lim[d]++;
5233 if (bSepPME)
5235 comm->load_mdf += comm->load[0].mdf;
5236 comm->load_pme += comm->load[0].pme;
5240 wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5242 if (debug)
5244 fprintf(debug,"get_load_distribution finished\n");
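/* The loss is (max - average)/total: e.g. with 4 nodes, a maximum
 * force load of 100 cycles against an average of 80, during steps of
 * 400 cycles, (100*4 - 320)/(400*4) = 0.05, i.e. 5% of the run time
 * is spent waiting for the most loaded node.
 */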
5248 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5250 /* Return the relative performance loss on the total run time
5251 * due to the force calculation load imbalance.
5253 if (dd->comm->nload > 0)
5255 return
5256 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5257 (dd->comm->load_step*dd->nnodes);
5259 else
5261 return 0;
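/* Print the load balance statistics accumulated over the whole run
 * to the log file and stderr, with notes when more than DD_PERF_LOSS
 * of the run time was lost to force-load or PP/PME imbalance.
 */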
5265 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5267 char buf[STRLEN];
5268 int npp,npme,nnodes,d,limp;
5269 float imbal,pme_f_ratio,lossf,lossp=0;
5270 gmx_bool bLim;
5271 gmx_domdec_comm_t *comm;
5273 comm = dd->comm;
5274 if (DDMASTER(dd) && comm->nload > 0)
5276 npp = dd->nnodes;
5277 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5278 nnodes = npp + npme;
5279 imbal = comm->load_max*npp/comm->load_sum - 1;
5280 lossf = dd_force_imb_perf_loss(dd);
5281 sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5282 fprintf(fplog,"%s",buf);
5283 fprintf(stderr,"\n");
5284 fprintf(stderr,"%s",buf);
5285 sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5286 fprintf(fplog,"%s",buf);
5287 fprintf(stderr,"%s",buf);
5288 bLim = FALSE;
5289 if (comm->bDynLoadBal)
5291 sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5292 for(d=0; d<dd->ndim; d++)
5294 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5295 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5296 if (limp >= 50)
5298 bLim = TRUE;
5301 sprintf(buf+strlen(buf),"\n");
5302 fprintf(fplog,"%s",buf);
5303 fprintf(stderr,"%s",buf);
5305 if (npme > 0)
5307 pme_f_ratio = comm->load_pme/comm->load_mdf;
5308 lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5309 if (lossp <= 0)
5311 lossp *= (float)npme/(float)nnodes;
5313 else
5315 lossp *= (float)npp/(float)nnodes;
5317 sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5318 fprintf(fplog,"%s",buf);
5319 fprintf(stderr,"%s",buf);
5320 sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5321 fprintf(fplog,"%s",buf);
5322 fprintf(stderr,"%s",buf);
5324 fprintf(fplog,"\n");
5325 fprintf(stderr,"\n");
5327 if (lossf >= DD_PERF_LOSS)
5329 sprintf(buf,
5330 "NOTE: %.1f %% performance was lost due to load imbalance\n"
5331 " in the domain decomposition.\n",lossf*100);
5332 if (!comm->bDynLoadBal)
5334 sprintf(buf+strlen(buf)," You might want to use dynamic load balancing (option -dlb).\n");
5336 else if (bLim)
5338 sprintf(buf+strlen(buf)," You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5340 fprintf(fplog,"%s\n",buf);
5341 fprintf(stderr,"%s\n",buf);
5343 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5345 sprintf(buf,
5346 "NOTE: %.1f %% performance was lost because the PME nodes\n"
5347 " had %s work to do than the PP nodes.\n"
5348 " You might want to %s the number of PME nodes\n"
5349 " or %s the cut-off and the grid spacing.\n",
5350 fabs(lossp*100),
5351 (lossp < 0) ? "less" : "more",
5352 (lossp < 0) ? "decrease" : "increase",
5353 (lossp < 0) ? "decrease" : "increase");
5354 fprintf(fplog,"%s\n",buf);
5355 fprintf(stderr,"%s\n",buf);
5360 static float dd_vol_min(gmx_domdec_t *dd)
5362 return dd->comm->load[0].cvol_min*dd->nnodes;
5365 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5367 return dd->comm->load[0].flags;
5370 static float dd_f_imbal(gmx_domdec_t *dd)
5372 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5375 float dd_pme_f_ratio(gmx_domdec_t *dd)
5377 if (dd->comm->cycl_n[ddCyclPME] > 0)
5379 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5381 else
5383 return -1.0;
5387 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5389 int flags,d;
5390 char buf[22];
5392 flags = dd_load_flags(dd);
5393 if (flags)
5395 fprintf(fplog,
5396 "DD load balancing is limited by minimum cell size in dimension");
5397 for(d=0; d<dd->ndim; d++)
5399 if (flags & (1<<d))
5401 fprintf(fplog," %c",dim2char(dd->dim[d]));
5404 fprintf(fplog,"\n");
5406 fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
5407 if (dd->comm->bDynLoadBal)
5409 fprintf(fplog," vol min/aver %5.3f%c",
5410 dd_vol_min(dd),flags ? '!' : ' ');
5412 fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5413 if (dd->comm->cycl_n[ddCyclPME])
5415 fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5417 fprintf(fplog,"\n\n");
5420 static void dd_print_load_verbose(gmx_domdec_t *dd)
5422 if (dd->comm->bDynLoadBal)
5424 fprintf(stderr,"vol %4.2f%c ",
5425 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5427 fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5428 if (dd->comm->cycl_n[ddCyclPME])
5430 fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5434 #ifdef GMX_MPI
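/* Create the communicator for one row of cells along dimension
 * dd->dim[dim_ind] by splitting mpi_comm_all; only ranks in the row
 * join, and the row root also gets the DLB bookkeeping arrays.
 */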
5435 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5437 MPI_Comm c_row;
5438 int dim, i, rank;
5439 ivec loc_c;
5440 gmx_domdec_root_t *root;
5441 gmx_bool bPartOfGroup = FALSE;
5443 dim = dd->dim[dim_ind];
5444 copy_ivec(loc,loc_c);
5445 for(i=0; i<dd->nc[dim]; i++)
5447 loc_c[dim] = i;
5448 rank = dd_index(dd->nc,loc_c);
5449 if (rank == dd->rank)
5451 /* This process is part of the group */
5452 bPartOfGroup = TRUE;
5455 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5456 &c_row);
5457 if (bPartOfGroup)
5459 dd->comm->mpi_comm_load[dim_ind] = c_row;
5460 if (dd->comm->eDLB != edlbNO)
5462 if (dd->ci[dim] == dd->master_ci[dim])
5464 /* This is the root process of this row */
5465 snew(dd->comm->root[dim_ind],1);
5466 root = dd->comm->root[dim_ind];
5467 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5468 snew(root->old_cell_f,dd->nc[dim]+1);
5469 snew(root->bCellMin,dd->nc[dim]);
5470 if (dim_ind > 0)
5472 snew(root->cell_f_max0,dd->nc[dim]);
5473 snew(root->cell_f_min1,dd->nc[dim]);
5474 snew(root->bound_min,dd->nc[dim]);
5475 snew(root->bound_max,dd->nc[dim]);
5477 snew(root->buf_ncd,dd->nc[dim]);
5479 else
5481 /* This is not a root process, we only need to receive cell_f */
5482 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5485 if (dd->ci[dim] == dd->master_ci[dim])
5487 snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5491 #endif
5493 static void make_load_communicators(gmx_domdec_t *dd)
5495 #ifdef GMX_MPI
5496 int dim0,dim1,i,j;
5497 ivec loc;
5499 if (debug)
5500 fprintf(debug,"Making load communicators\n");
5502 snew(dd->comm->load,dd->ndim);
5503 snew(dd->comm->mpi_comm_load,dd->ndim);
5505 clear_ivec(loc);
5506 make_load_communicator(dd,0,loc);
5507 if (dd->ndim > 1) {
5508 dim0 = dd->dim[0];
5509 for(i=0; i<dd->nc[dim0]; i++) {
5510 loc[dim0] = i;
5511 make_load_communicator(dd,1,loc);
5514 if (dd->ndim > 2) {
5515 dim0 = dd->dim[0];
5516 for(i=0; i<dd->nc[dim0]; i++) {
5517 loc[dim0] = i;
5518 dim1 = dd->dim[1];
5519 for(j=0; j<dd->nc[dim1]; j++) {
5520 loc[dim1] = j;
5521 make_load_communicator(dd,2,loc);
5526 if (debug)
5527 fprintf(debug,"Finished making load communicators\n");
5528 #endif
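/* Set up the DD communication grid: the forward and backward neighbor
 * ranks in each decomposed dimension, the (up to 8) zones with their
 * shifts, and the i-zone ranges used for neighbor searching.
 */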
5531 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5533 gmx_bool bZYX;
5534 int d,dim,i,j,m;
5535 ivec tmp,s;
5536 int nzone,nzonep;
5537 ivec dd_zp[DD_MAXIZONE];
5538 gmx_domdec_zones_t *zones;
5539 gmx_domdec_ns_ranges_t *izone;
5541 for(d=0; d<dd->ndim; d++)
5543 dim = dd->dim[d];
5544 copy_ivec(dd->ci,tmp);
5545 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5546 dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5547 copy_ivec(dd->ci,tmp);
5548 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5549 dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5550 if (debug)
5552 fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5553 dd->rank,dim,
5554 dd->neighbor[d][0],
5555 dd->neighbor[d][1]);
5559 if (DDMASTER(dd))
5561 fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5562 dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5564 if (fplog)
5566 fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5567 dd->ndim,
5568 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5569 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5571 switch (dd->ndim)
5573 case 3:
5574 nzone = dd_z3n;
5575 nzonep = dd_zp3n;
5576 for(i=0; i<nzonep; i++)
5578 copy_ivec(dd_zp3[i],dd_zp[i]);
5580 break;
5581 case 2:
5582 nzone = dd_z2n;
5583 nzonep = dd_zp2n;
5584 for(i=0; i<nzonep; i++)
5586 copy_ivec(dd_zp2[i],dd_zp[i]);
5588 break;
5589 case 1:
5590 nzone = dd_z1n;
5591 nzonep = dd_zp1n;
5592 for(i=0; i<nzonep; i++)
5594 copy_ivec(dd_zp1[i],dd_zp[i]);
5596 break;
5597 default:
5598 gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5599 nzone = 0;
5600 nzonep = 0;
5603 zones = &dd->comm->zones;
5605 for(i=0; i<nzone; i++)
5607 m = 0;
5608 clear_ivec(zones->shift[i]);
5609 for(d=0; d<dd->ndim; d++)
5611 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5615 zones->n = nzone;
5616 for(i=0; i<nzone; i++)
5618 for(d=0; d<DIM; d++)
5620 s[d] = dd->ci[d] - zones->shift[i][d];
5621 if (s[d] < 0)
5623 s[d] += dd->nc[d];
5625 else if (s[d] >= dd->nc[d])
5627 s[d] -= dd->nc[d];
5631 zones->nizone = nzonep;
5632 for(i=0; i<zones->nizone; i++)
5634 if (dd_zp[i][0] != i)
5636 gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5638 izone = &zones->izone[i];
5639 izone->j0 = dd_zp[i][1];
5640 izone->j1 = dd_zp[i][2];
5641 for(dim=0; dim<DIM; dim++)
5643 if (dd->nc[dim] == 1)
5645 /* All shifts should be allowed */
5646 izone->shift0[dim] = -1;
5647 izone->shift1[dim] = 1;
5649 else
/* Disabled legacy variant, superseded by the shift_diff loop below:
5652 izone->shift0[d] = 0;
5653 izone->shift1[d] = 0;
5654 for(j=izone->j0; j<izone->j1; j++) {
5655 if (dd->shift[j][d] > dd->shift[i][d])
5656 izone->shift0[d] = -1;
5657 if (dd->shift[j][d] < dd->shift[i][d])
5658 izone->shift1[d] = 1;
}
*/
5662 int shift_diff;
5664 /* Assume the shifts are not more than 1 cell */
5665 izone->shift0[dim] = 1;
5666 izone->shift1[dim] = -1;
5667 for(j=izone->j0; j<izone->j1; j++)
5669 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5670 if (shift_diff < izone->shift0[dim])
5672 izone->shift0[dim] = shift_diff;
5674 if (shift_diff > izone->shift1[dim])
5676 izone->shift1[dim] = shift_diff;
5683 if (dd->comm->eDLB != edlbNO)
5685 snew(dd->comm->root,dd->ndim);
5688 if (dd->comm->bRecordLoad)
5690 make_load_communicators(dd);
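/* Set up the communicator for the particle-particle ranks, optionally
 * as a Cartesian communicator, and determine the DD master rank and
 * the mapping from DD index to simulation nodeid.
 */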
5694 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5696 gmx_domdec_t *dd;
5697 gmx_domdec_comm_t *comm;
5698 int i,rank,*buf;
5699 ivec periods;
5700 #ifdef GMX_MPI
5701 MPI_Comm comm_cart;
5702 #endif
5704 dd = cr->dd;
5705 comm = dd->comm;
5707 #ifdef GMX_MPI
5708 if (comm->bCartesianPP)
5710 /* Set up cartesian communication for the particle-particle part */
5711 if (fplog)
5713 fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5714 dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5717 for(i=0; i<DIM; i++)
5719 periods[i] = TRUE;
5721 MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5722 &comm_cart);
5723 /* We overwrite the old communicator with the new cartesian one */
5724 cr->mpi_comm_mygroup = comm_cart;
5727 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5728 MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5730 if (comm->bCartesianPP_PME)
5732 /* Since we want to use the original cartesian setup for the simulation,
5733 * and not the one after splitting, we need to make an index.
5735 snew(comm->ddindex2ddnodeid,dd->nnodes);
5736 comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5737 gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5738 /* Get the rank of the DD master;
5739 * above we made sure that the master node is a PP node.
5741 if (MASTER(cr))
5743 rank = dd->rank;
5745 else
5747 rank = 0;
5749 MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5751 else if (comm->bCartesianPP)
5753 if (cr->npmenodes == 0)
5755 /* The PP communicator is also
5756 * the communicator for this simulation
5758 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5760 cr->nodeid = dd->rank;
5762 MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5764 /* We need to make an index to go from the coordinates
5765 * to the nodeid of this simulation.
5767 snew(comm->ddindex2simnodeid,dd->nnodes);
5768 snew(buf,dd->nnodes);
5769 if (cr->duty & DUTY_PP)
5771 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5773 /* Communicate the ddindex to simulation nodeid index */
5774 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5775 cr->mpi_comm_mysim);
5776 sfree(buf);
5778 /* Determine the master coordinates and rank.
5779 * The DD master should be the same node as the master of this sim.
5781 for(i=0; i<dd->nnodes; i++)
5783 if (comm->ddindex2simnodeid[i] == 0)
5785 ddindex2xyz(dd->nc,i,dd->master_ci);
5786 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5789 if (debug)
5791 fprintf(debug,"The master rank is %d\n",dd->masterrank);
5794 else
5796 /* No Cartesian communicators */
5797 /* We use the rank in dd->comm->all as DD index */
5798 ddindex2xyz(dd->nc,dd->rank,dd->ci);
5799 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5800 dd->masterrank = 0;
5801 clear_ivec(dd->master_ci);
5803 #endif
5805 if (fplog)
5807 fprintf(fplog,
5808 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5809 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5811 if (debug)
5813 fprintf(debug,
5814 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5815 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5819 static void receive_ddindex2simnodeid(t_commrec *cr)
5821 gmx_domdec_t *dd;
5823 gmx_domdec_comm_t *comm;
5824 int *buf;
5826 dd = cr->dd;
5827 comm = dd->comm;
5829 #ifdef GMX_MPI
5830 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5832 snew(comm->ddindex2simnodeid,dd->nnodes);
5833 snew(buf,dd->nnodes);
5834 if (cr->duty & DUTY_PP)
5836 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5839 /* Communicate the ddindex to simulation nodeid index */
5840 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5841 cr->mpi_comm_mysim);
5843 sfree(buf);
5845 #endif
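/* Allocate the master-only data that holds the global charge group
 * division; the scatter/gather buffer vbuf is only allocated when
 * there are more than GMX_DD_NNODES_SENDRECV nodes.
 */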
5848 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5849 int ncg,int natoms)
5851 gmx_domdec_master_t *ma;
5852 int i;
5854 snew(ma,1);
5856 snew(ma->ncg,dd->nnodes);
5857 snew(ma->index,dd->nnodes+1);
5858 snew(ma->cg,ncg);
5859 snew(ma->nat,dd->nnodes);
5860 snew(ma->ibuf,dd->nnodes*2);
5861 snew(ma->cell_x,DIM);
5862 for(i=0; i<DIM; i++)
5864 snew(ma->cell_x[i],dd->nc[i]+1);
5867 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5869 ma->vbuf = NULL;
5871 else
5873 snew(ma->vbuf,natoms);
5876 return ma;
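/* Split the simulation communicator into a PP and a PME part, either
 * by stacking the PME-only nodes onto a Cartesian grid or by assigning
 * duties according to dd_node_order and splitting on the duty.
 */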
5879 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5880 int reorder)
5882 gmx_domdec_t *dd;
5883 gmx_domdec_comm_t *comm;
5884 int i,rank;
5885 gmx_bool bDiv[DIM];
5886 ivec periods;
5887 #ifdef GMX_MPI
5888 MPI_Comm comm_cart;
5889 #endif
5891 dd = cr->dd;
5892 comm = dd->comm;
5894 if (comm->bCartesianPP)
5896 for(i=1; i<DIM; i++)
5898 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5900 if (bDiv[YY] || bDiv[ZZ])
5902 comm->bCartesianPP_PME = TRUE;
5903 /* If we have 2D PME decomposition, which is always in x+y,
5904 * we stack the PME-only nodes in z.
5905 * Otherwise we choose the direction that provides the thinnest slab
5906 * of PME-only nodes, as this will have the least effect
5907 * on the PP communication.
5908 * But for the PME communication the opposite might be better.
5910 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5911 !bDiv[YY] ||
5912 dd->nc[YY] > dd->nc[ZZ]))
5914 comm->cartpmedim = ZZ;
5916 else
5918 comm->cartpmedim = YY;
5920 comm->ntot[comm->cartpmedim]
5921 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5923 else if (fplog)
5925 fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5926 fprintf(fplog,
5927 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5931 #ifdef GMX_MPI
5932 if (comm->bCartesianPP_PME)
5934 if (fplog)
5936 fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5939 for(i=0; i<DIM; i++)
5941 periods[i] = TRUE;
5943 MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5944 &comm_cart);
5946 MPI_Comm_rank(comm_cart,&rank);
5947 if (MASTERNODE(cr) && rank != 0)
5949 gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5952 /* With this assignment we lose the link to the original communicator,
5953 * which will usually be MPI_COMM_WORLD, unless we have multisim.
5955 cr->mpi_comm_mysim = comm_cart;
5956 cr->sim_nodeid = rank;
5958 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5960 if (fplog)
5962 fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5963 cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5966 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5968 cr->duty = DUTY_PP;
5970 if (cr->npmenodes == 0 ||
5971 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5973 cr->duty = DUTY_PME;
5976 /* Split the sim communicator into PP and PME only nodes */
5977 MPI_Comm_split(cr->mpi_comm_mysim,
5978 cr->duty,
5979 dd_index(comm->ntot,dd->ci),
5980 &cr->mpi_comm_mygroup);
5982 else
5984 switch (dd_node_order)
5986 case ddnoPP_PME:
5987 if (fplog)
5989 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5991 break;
5992 case ddnoINTERLEAVE:
5993 /* Interleave the PP-only and PME-only nodes,
5994 * as on clusters with dual-core machines this will double
5995 * the communication bandwidth of the PME processes
5996 * and thus speed up the PP <-> PME and inter-PME communication.
5998 if (fplog)
6000 fprintf(fplog,"Interleaving PP and PME nodes\n");
6002 comm->pmenodes = dd_pmenodes(cr);
6003 break;
6004 case ddnoCARTESIAN:
6005 break;
6006 default:
6007 gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6010 if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6012 cr->duty = DUTY_PME;
6014 else
6016 cr->duty = DUTY_PP;
6019 /* Split the sim communicator into PP and PME only nodes */
6020 MPI_Comm_split(cr->mpi_comm_mysim,
6021 cr->duty,
6022 cr->nodeid,
6023 &cr->mpi_comm_mygroup);
6024 MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6026 #endif
6028 if (fplog)
6030 fprintf(fplog,"This is a %s only node\n\n",
6031 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
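/* Top-level setup of the DD communicators: split off the separate PME
 * nodes when present, create the PP communicator and connect each PP
 * node to its PME node.
 */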
6035 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6037 gmx_domdec_t *dd;
6038 gmx_domdec_comm_t *comm;
6039 int CartReorder;
6041 dd = cr->dd;
6042 comm = dd->comm;
6044 copy_ivec(dd->nc,comm->ntot);
6046 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6047 comm->bCartesianPP_PME = FALSE;
6049 /* Reorder the nodes by default. This might change the MPI ranks.
6050 * Real reordering is only supported on very few architectures;
6051 * Blue Gene is one of them.
6053 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6055 if (cr->npmenodes > 0)
6057 /* Split the communicator into a PP and PME part */
6058 split_communicator(fplog,cr,dd_node_order,CartReorder);
6059 if (comm->bCartesianPP_PME)
6061 /* We (possibly) reordered the nodes in split_communicator,
6062 * so it is no longer required in make_pp_communicator.
6064 CartReorder = FALSE;
6067 else
6069 /* All nodes do PP and PME */
6070 #ifdef GMX_MPI
6071 /* We do not require separate communicators */
6072 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6073 #endif
6076 if (cr->duty & DUTY_PP)
6078 /* Copy or make a new PP communicator */
6079 make_pp_communicator(fplog,cr,CartReorder);
6081 else
6083 receive_ddindex2simnodeid(cr);
6086 if (!(cr->duty & DUTY_PME))
6088 /* Set up the communication to our PME node */
6089 dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6090 dd->pme_receive_vir_ener = receive_vir_ener(cr);
6091 if (debug)
6093 fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6094 dd->pme_nodeid,dd->pme_receive_vir_ener);
6097 else
6099 dd->pme_nodeid = -1;
6102 if (DDMASTER(dd))
6104 dd->ma = init_gmx_domdec_master_t(dd,
6105 comm->cgs_gl.nr,
6106 comm->cgs_gl.index[comm->cgs_gl.nr]);
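/* Parse the static load balancing cell size fractions for one
 * direction from a string of nc numbers and normalize them.
 * For example (illustrative values), the string "1 1 2 2" with nc=4
 * gives relative sizes 0.167 0.167 0.333 0.333; such strings are
 * typically supplied with the mdrun options -ddcsx, -ddcsy and -ddcsz.
 */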
6110 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6112 real *slb_frac,tot;
6113 int i,n;
6114 double dbl;
6116 slb_frac = NULL;
6117 if (nc > 1 && size_string != NULL)
6119 if (fplog)
6121 fprintf(fplog,"Using static load balancing for the %s direction\n",
6122 dir);
6124 snew(slb_frac,nc);
6125 tot = 0;
6126 for (i=0; i<nc; i++)
6128 dbl = 0;
6129 sscanf(size_string,"%lf%n",&dbl,&n);
6130 if (dbl == 0)
6132 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6134 slb_frac[i] = dbl;
6135 size_string += n;
6136 tot += slb_frac[i];
6138 /* Normalize */
6139 if (fplog)
6141 fprintf(fplog,"Relative cell sizes:");
6143 for (i=0; i<nc; i++)
6145 slb_frac[i] /= tot;
6146 if (fplog)
6148 fprintf(fplog," %5.3f",slb_frac[i]);
6151 if (fplog)
6153 fprintf(fplog,"\n");
6157 return slb_frac;
6160 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6162 int n,nmol,ftype;
6163 gmx_mtop_ilistloop_t iloop;
6164 t_ilist *il;
6166 n = 0;
6167 iloop = gmx_mtop_ilistloop_init(mtop);
6168 while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6170 for(ftype=0; ftype<F_NRE; ftype++)
6172 if ((interaction_function[ftype].flags & IF_BOND) &&
6173 NRAL(ftype) > 2)
6175 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6180 return n;
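/* Read an integer setting from an environment variable;
 * returns def when the variable is not set and 1 when its value
 * cannot be parsed.
 */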
6183 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6185 char *val;
6186 int nst;
6188 nst = def;
6189 val = getenv(env_var);
6190 if (val)
6192 if (sscanf(val,"%d",&nst) <= 0)
6194 nst = 1;
6196 if (fplog)
6198 fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6199 env_var,val,nst);
6203 return nst;
6206 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6208 if (MASTER(cr))
6210 fprintf(stderr,"\n%s\n",warn_string);
6212 if (fplog)
6214 fprintf(fplog,"\n%s\n",warn_string);
6218 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6219 t_inputrec *ir,FILE *fplog)
6221 if (ir->ePBC == epbcSCREW &&
6222 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6224 gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6227 if (ir->ns_type == ensSIMPLE)
6229 gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6232 if (ir->nstlist == 0)
6234 gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6237 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6239 dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6243 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6245 int di,d;
6246 real r;
6248 r = ddbox->box_size[XX];
6249 for(di=0; di<dd->ndim; di++)
6251 d = dd->dim[di];
6252 /* Check using the initial average cell size */
6253 r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6256 return r;
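/* Determine the DLB setting from the -dlb option string ("auto", "no"
 * or "yes"), downgrading it when reruns, non-dynamical integrators,
 * missing cycle counters or reproducibility requirements make dynamic
 * load balancing unsupported or unwanted.
 */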
6259 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6260 const char *dlb_opt,gmx_bool bRecordLoad,
6261 unsigned long Flags,t_inputrec *ir)
6263 gmx_domdec_t *dd;
6264 int eDLB=-1;
6265 char buf[STRLEN];
6267 switch (dlb_opt[0])
6269 case 'a': eDLB = edlbAUTO; break;
6270 case 'n': eDLB = edlbNO; break;
6271 case 'y': eDLB = edlbYES; break;
6272 default: gmx_incons("Unknown dlb_opt");
6275 if (Flags & MD_RERUN)
6277 return edlbNO;
6280 if (!EI_DYNAMICS(ir->eI))
6282 if (eDLB == edlbYES)
6284 sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6285 dd_warning(cr,fplog,buf);
6288 return edlbNO;
6291 if (!bRecordLoad)
6293 dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6295 return edlbNO;
6298 if (Flags & MD_REPRODUCIBLE)
6300 switch (eDLB)
6302 case edlbNO:
6303 break;
6304 case edlbAUTO:
6305 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6306 eDLB = edlbNO;
6307 break;
6308 case edlbYES:
6309 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6310 break;
6311 default:
6312 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6313 break;
6317 return eDLB;
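/* Fill dd->dim with the decomposed dimensions in communication order:
 * x,y,z by default, z,y,x when GMX_DD_ORDER_ZYX is set.
 */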
6320 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6322 int dim;
6324 dd->ndim = 0;
6325 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6327 /* Decomposition order z,y,x */
6328 if (fplog)
6330 fprintf(fplog,"Using domain decomposition order z, y, x\n");
6332 for(dim=DIM-1; dim>=0; dim--)
6334 if (dd->nc[dim] > 1)
6336 dd->dim[dd->ndim++] = dim;
6340 else
6342 /* Decomposition order x,y,z */
6343 for(dim=0; dim<DIM; dim++)
6345 if (dd->nc[dim] > 1)
6347 dd->dim[dd->ndim++] = dim;
6353 static gmx_domdec_comm_t *init_dd_comm()
6355 gmx_domdec_comm_t *comm;
6356 int i;
6358 snew(comm,1);
6359 snew(comm->cggl_flag,DIM*2);
6360 snew(comm->cgcm_state,DIM*2);
6361 for(i=0; i<DIM*2; i++)
6363 comm->cggl_flag_nalloc[i] = 0;
6364 comm->cgcm_state_nalloc[i] = 0;
6367 comm->nalloc_int = 0;
6368 comm->buf_int = NULL;
6370 vec_rvec_init(&comm->vbuf);
6372 comm->n_load_have = 0;
6373 comm->n_load_collect = 0;
6375 for(i=0; i<ddnatNR-ddnatZONE; i++)
6377 comm->sum_nat[i] = 0;
6379 comm->ndecomp = 0;
6380 comm->nload = 0;
6381 comm->load_step = 0;
6382 comm->load_sum = 0;
6383 comm->load_max = 0;
6384 clear_ivec(comm->load_lim);
6385 comm->load_mdf = 0;
6386 comm->load_pme = 0;
6388 return comm;
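/* Create and initialize the domain decomposition data for this run:
 * read the environment overrides, determine the cell size limits set
 * by bonded interactions and constraints, and choose or validate the
 * DD grid and the number of separate PME nodes.
 */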
6391 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6392 unsigned long Flags,
6393 ivec nc,
6394 real comm_distance_min,real rconstr,
6395 const char *dlb_opt,real dlb_scale,
6396 const char *sizex,const char *sizey,const char *sizez,
6397 gmx_mtop_t *mtop,t_inputrec *ir,
6398 matrix box,rvec *x,
6399 gmx_ddbox_t *ddbox,
6400 int *npme_x,int *npme_y)
6402 gmx_domdec_t *dd;
6403 gmx_domdec_comm_t *comm;
6404 int recload;
6405 int d,i,j;
6406 real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6407 gmx_bool bC;
6408 char buf[STRLEN];
6410 if (fplog)
6412 fprintf(fplog,
6413 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6416 snew(dd,1);
6418 dd->comm = init_dd_comm();
6419 comm = dd->comm;
6423 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6424 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6426 dd->bSendRecv2 = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6427 comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6428 comm->eFlop = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6429 recload = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6430 comm->nstSortCG = dd_nst_env(fplog,"GMX_DD_SORT",1);
6431 comm->nstDDDump = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6432 comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6433 comm->DD_debug = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6435 dd->pme_recv_f_alloc = 0;
6436 dd->pme_recv_f_buf = NULL;
6438 if (dd->bSendRecv2 && fplog)
6440 fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6442 if (comm->eFlop)
6444 if (fplog)
6446 fprintf(fplog,"Will load balance based on FLOP count\n");
6448 if (comm->eFlop > 1)
6450 srand(1+cr->nodeid);
6452 comm->bRecordLoad = TRUE;
6454 else
6456 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6460 comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6462 comm->bDynLoadBal = (comm->eDLB == edlbYES);
6463 if (fplog)
6465 fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6467 dd->bGridJump = comm->bDynLoadBal;
6469 if (comm->nstSortCG)
6471 if (fplog)
6473 if (comm->nstSortCG == 1)
6475 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6477 else
6479 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6480 comm->nstSortCG);
6483 snew(comm->sort,1);
6485 else
6487 if (fplog)
6489 fprintf(fplog,"Will not sort the charge groups\n");
6493 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6495 comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6496 if (comm->bInterCGBondeds)
6498 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6500 else
6502 comm->bInterCGMultiBody = FALSE;
6505 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6506 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6508 if (ir->rlistlong == 0)
6510 /* Set the cut-off to some very large value,
6511 * so we don't need if statements everywhere in the code.
6512 * We use sqrt, since the cut-off is squared in some places.
6514 comm->cutoff = GMX_CUTOFF_INF;
6516 else
6518 comm->cutoff = ir->rlistlong;
6520 comm->cutoff_mbody = 0;
6522 comm->cellsize_limit = 0;
6523 comm->bBondComm = FALSE;
6525 if (comm->bInterCGBondeds)
6527 if (comm_distance_min > 0)
6529 comm->cutoff_mbody = comm_distance_min;
6530 if (Flags & MD_DDBONDCOMM)
6532 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6534 else
6536 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6538 r_bonded_limit = comm->cutoff_mbody;
6540 else if (ir->bPeriodicMols)
6542 /* Cannot easily determine the required cut-off */
6543 dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6544 comm->cutoff_mbody = comm->cutoff/2;
6545 r_bonded_limit = comm->cutoff_mbody;
6547 else
6549 if (MASTER(cr))
6551 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6552 Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6554 gmx_bcast(sizeof(r_2b),&r_2b,cr);
6555 gmx_bcast(sizeof(r_mb),&r_mb,cr);
6557 /* We use an initial margin of 10% for the minimum cell size,
6558 * except when we are just below the non-bonded cut-off.
6560 if (Flags & MD_DDBONDCOMM)
6562 if (max(r_2b,r_mb) > comm->cutoff)
6564 r_bonded = max(r_2b,r_mb);
6565 r_bonded_limit = 1.1*r_bonded;
6566 comm->bBondComm = TRUE;
6568 else
6570 r_bonded = r_mb;
6571 r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6573 /* We determine cutoff_mbody later */
6575 else
6577 /* No special bonded communication,
6578 * simply increase the DD cut-off.
6580 r_bonded_limit = 1.1*max(r_2b,r_mb);
6581 comm->cutoff_mbody = r_bonded_limit;
6582 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6585 comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6586 if (fplog)
6588 fprintf(fplog,
6589 "Minimum cell size due to bonded interactions: %.3f nm\n",
6590 comm->cellsize_limit);
6594 if (dd->bInterCGcons && rconstr <= 0)
6596 /* There is a cell size limit due to the constraints (P-LINCS) */
6597 rconstr = constr_r_max(fplog,mtop,ir);
6598 if (fplog)
6600 fprintf(fplog,
6601 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6602 rconstr);
6603 if (rconstr > comm->cellsize_limit)
6605 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6609 else if (rconstr > 0 && fplog)
6611 /* Here we do not check for dd->bInterCGcons,
6612 * because one can also set a cell size limit for virtual sites only
6613 * and at this point we don't know yet if there are intercg v-sites.
6615 fprintf(fplog,
6616 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6617 rconstr);
6619 comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6621 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6623 if (nc[XX] > 0)
6625 copy_ivec(nc,dd->nc);
6626 set_dd_dim(fplog,dd);
6627 set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6629 if (cr->npmenodes == -1)
6631 cr->npmenodes = 0;
6633 acs = average_cellsize_min(dd,ddbox);
6634 if (acs < comm->cellsize_limit)
6636 if (fplog)
6638 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6640 gmx_fatal_collective(FARGS,cr,NULL,
6641 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6642 acs,comm->cellsize_limit);
6645 else
6647 set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6649 /* We need to choose the optimal DD grid and possibly PME nodes */
6650 limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6651 comm->eDLB!=edlbNO,dlb_scale,
6652 comm->cellsize_limit,comm->cutoff,
6653 comm->bInterCGBondeds,comm->bInterCGMultiBody);
6655 if (dd->nc[XX] == 0)
6657 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6658 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6659 !bC ? "-rdd" : "-rcon",
6660 comm->eDLB!=edlbNO ? " or -dds" : "",
6661 bC ? " or your LINCS settings" : "");
6663 gmx_fatal_collective(FARGS,cr,NULL,
6664 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6665 "%s\n"
6666 "Look in the log file for details on the domain decomposition",
6667 cr->nnodes-cr->npmenodes,limit,buf);
6669 set_dd_dim(fplog,dd);
6672 if (fplog)
6674 fprintf(fplog,
6675 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6676 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6679 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6680 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6682 gmx_fatal_collective(FARGS,cr,NULL,
6683 "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6684 dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6686 if (cr->npmenodes > dd->nnodes)
6688 gmx_fatal_collective(FARGS,cr,NULL,
6689 "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6691 if (cr->npmenodes > 0)
6693 comm->npmenodes = cr->npmenodes;
6695 else
6697 comm->npmenodes = dd->nnodes;
6700 if (EEL_PME(ir->coulombtype))
6702 /* The following choices should match those
6703 * in comm_cost_est in domdec_setup.c.
6704 * Note that here the checks have to take into account
6705 * that the decomposition might occur in a different order than xyz
6706 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6707 * in which case they will not match those in comm_cost_est,
6708 * but since that is mainly for testing purposes that's fine.
6710 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6711 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6712 getenv("GMX_PMEONEDD") == NULL)
6714 comm->npmedecompdim = 2;
6715 comm->npmenodes_x = dd->nc[XX];
6716 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6718 else
6720 /* In case nc is 1 in both x and y we could still choose to
6721 * decompose PME in y instead of x, but we use x for simplicity.
6723 comm->npmedecompdim = 1;
6724 if (dd->dim[0] == YY)
6726 comm->npmenodes_x = 1;
6727 comm->npmenodes_y = comm->npmenodes;
6729 else
6731 comm->npmenodes_x = comm->npmenodes;
6732 comm->npmenodes_y = 1;
6735 if (fplog)
6737 fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6738 comm->npmenodes_x,comm->npmenodes_y,1);
6741 else
6743 comm->npmedecompdim = 0;
6744 comm->npmenodes_x = 0;
6745 comm->npmenodes_y = 0;
6748 /* Technically we don't need both of these,
6749 * but not having to recalculate them simplifies the code.
6751 *npme_x = comm->npmenodes_x;
6752 *npme_y = comm->npmenodes_y;
6754 snew(comm->slb_frac,DIM);
6755 if (comm->eDLB == edlbNO)
6757 comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6758 comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6759 comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6762 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6764 if (comm->bBondComm || comm->eDLB != edlbNO)
6766 /* Set the bonded communication distance to halfway
6767 * the minimum and the maximum,
6768 * since the extra communication cost is nearly zero.
6770 acs = average_cellsize_min(dd,ddbox);
6771 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6772 if (comm->eDLB != edlbNO)
6774 /* Check if this does not limit the scaling */
6775 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6777 if (!comm->bBondComm)
6779 /* Without bBondComm do not go beyond the n.b. cut-off */
6780 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6781 if (comm->cellsize_limit >= comm->cutoff)
6783 /* We don't lose a lot of efficiency
6784 * when increasing it to the n.b. cut-off.
6785 * It can even be slightly faster, because we need
6786 * fewer checks for the communication setup.
6788 comm->cutoff_mbody = comm->cutoff;
6791 /* Check if we did not end up below our original limit */
6792 comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6794 if (comm->cutoff_mbody > comm->cellsize_limit)
6796 comm->cellsize_limit = comm->cutoff_mbody;
6799 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6802 if (debug)
6804 fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6805 "cellsize limit %f\n",
6806 comm->bBondComm,comm->cellsize_limit);
6809 if (MASTER(cr))
6811 check_dd_restrictions(cr,dd,ir,fplog);
6814 comm->partition_step = INT_MIN;
6815 dd->ddp_count = 0;
6817 clear_dd_cycle_counts(dd);
6819 return dd;
6822 static void set_dlb_limits(gmx_domdec_t *dd)
6825 int d;
6827 for(d=0; d<dd->ndim; d++)
6829 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6830 dd->comm->cellsize_min[dd->dim[d]] =
6831 dd->comm->cellsize_min_dlb[dd->dim[d]];
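/* Switch from DLB "auto" to actual dynamic load balancing, unless the
 * current minimum cell size is less than 5% above the cell size limit,
 * in which case DLB is switched off for the rest of the run.
 */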
6836 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6838 gmx_domdec_t *dd;
6839 gmx_domdec_comm_t *comm;
6840 real cellsize_min;
6841 int d,nc,i;
6842 char buf[STRLEN];
6844 dd = cr->dd;
6845 comm = dd->comm;
6847 if (fplog)
6849 fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6852 cellsize_min = comm->cellsize_min[dd->dim[0]];
6853 for(d=1; d<dd->ndim; d++)
6855 cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6858 if (cellsize_min < comm->cellsize_limit*1.05)
6860 dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6862 /* Change DLB from "auto" to "no". */
6863 comm->eDLB = edlbNO;
6865 return;
6868 dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6869 comm->bDynLoadBal = TRUE;
6870 dd->bGridJump = TRUE;
6872 set_dlb_limits(dd);
6874 /* We can set the required cell size info here,
6875 * so we do not need to communicate this.
6876 * The grid is completely uniform.
6878 for(d=0; d<dd->ndim; d++)
6880 if (comm->root[d])
6882 comm->load[d].sum_m = comm->load[d].sum;
6884 nc = dd->nc[dd->dim[d]];
6885 for(i=0; i<nc; i++)
6887 comm->root[d]->cell_f[i] = i/(real)nc;
6888 if (d > 0)
6890 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6891 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6894 comm->root[d]->cell_f[nc] = 1.0;
6899 static char *init_bLocalCG(gmx_mtop_t *mtop)
6901 int ncg,cg;
6902 char *bLocalCG;
6904 ncg = ncg_mtop(mtop);
6905 snew(bLocalCG,ncg);
6906 for(cg=0; cg<ncg; cg++)
6908 bLocalCG[cg] = FALSE;
6911 return bLocalCG;
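/* Set up the communication of bonded interactions: make the reverse
 * topology and, with bBondComm, the charge group links and bLocalCG
 * flags used to decide which cg's to communicate beyond the cut-off.
 */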
6914 void dd_init_bondeds(FILE *fplog,
6915 gmx_domdec_t *dd,gmx_mtop_t *mtop,
6916 gmx_vsite_t *vsite,gmx_constr_t constr,
6917 t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6919 gmx_domdec_comm_t *comm;
6920 gmx_bool bBondComm;
6921 int d;
6923 dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6925 comm = dd->comm;
6927 if (comm->bBondComm)
6929 /* Communicate atoms beyond the cut-off for bonded interactions */
6932 comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6934 comm->bLocalCG = init_bLocalCG(mtop);
6936 else
6938 /* Only communicate atoms based on cut-off */
6939 comm->cglink = NULL;
6940 comm->bLocalCG = NULL;
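/* Print the communication pulse counts, cell size limits and the
 * maximum allowed interaction distances of the chosen decomposition
 * to the log file.
 */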
6944 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6945 t_inputrec *ir,
6946 gmx_bool bDynLoadBal,real dlb_scale,
6947 gmx_ddbox_t *ddbox)
6949 gmx_domdec_comm_t *comm;
6950 int d;
6951 ivec np;
6952 real limit,shrink;
6953 char buf[64];
6955 if (fplog == NULL)
6957 return;
6960 comm = dd->comm;
6962 if (bDynLoadBal)
6964 fprintf(fplog,"The maximum number of communication pulses is:");
6965 for(d=0; d<dd->ndim; d++)
6967 fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6969 fprintf(fplog,"\n");
6970 fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6971 fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6972 fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6973 for(d=0; d<DIM; d++)
6975 if (dd->nc[d] > 1)
6977 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6979 shrink = 0;
6981 else
6983 shrink =
6984 comm->cellsize_min_dlb[d]/
6985 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6987 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6990 fprintf(fplog,"\n");
6992 else
6994 set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6995 fprintf(fplog,"The initial number of communication pulses is:");
6996 for(d=0; d<dd->ndim; d++)
6998 fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7000 fprintf(fplog,"\n");
7001 fprintf(fplog,"The initial domain decomposition cell size is:");
7002 for(d=0; d<DIM; d++) {
7003 if (dd->nc[d] > 1)
7005 fprintf(fplog," %c %.2f nm",
7006 dim2char(d),dd->comm->cellsize_min[d]);
7009 fprintf(fplog,"\n\n");
7012 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7014 fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7015 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7016 "non-bonded interactions","",comm->cutoff);
7018 if (bDynLoadBal)
7020 limit = dd->comm->cellsize_limit;
7022 else
7024 if (dynamic_dd_box(ddbox,ir))
7026 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7028 limit = dd->comm->cellsize_min[XX];
7029 for(d=1; d<DIM; d++)
7031 limit = min(limit,dd->comm->cellsize_min[d]);
7035 if (comm->bInterCGBondeds)
7037 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7038 "two-body bonded interactions","(-rdd)",
7039 max(comm->cutoff,comm->cutoff_mbody));
7040 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7041 "multi-body bonded interactions","(-rdd)",
7042 (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7044 if (dd->vsite_comm)
7046 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7047 "virtual site constructions","(-rcon)",limit);
7049 if (dd->constraint_comm)
7051 sprintf(buf,"atoms separated by up to %d constraints",
7052 1+ir->nProjOrder);
7053 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7054 buf,"(-rcon)",limit);
7056 fprintf(fplog,"\n");
7059 fflush(fplog);
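/* Determine the cell size limits and maximum number of communication
 * pulses per dimension for dynamic load balancing. For instance, with
 * a 1.0 nm cut-off and a 0.35 nm cell size limit,
 * npulse = (int)(0.96 + 1.0/0.35) = 3; the 0.96 rounds down slightly
 * to avoid an extra pulse when the cut-off barely exceeds a multiple
 * of the cell size.
 */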
7062 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7063 real dlb_scale,
7064 const t_inputrec *ir,
7065 const gmx_ddbox_t *ddbox)
7067 gmx_domdec_comm_t *comm;
7068 int d,dim,npulse,npulse_d_max,npulse_d;
7069 gmx_bool bNoCutOff;
7071 comm = dd->comm;
7073 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7075 /* Determine the maximum number of comm. pulses in one dimension */
7077 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7079 /* Determine the maximum required number of grid pulses */
7080 if (comm->cellsize_limit >= comm->cutoff)
7082 /* Only a single pulse is required */
7083 npulse = 1;
7085 else if (!bNoCutOff && comm->cellsize_limit > 0)
7087 /* We round down slightly here to avoid overhead due to the latency
7088 * of extra communication calls when the cut-off
7089 * would be only slightly longer than the cell size.
7090 * Later cellsize_limit is redetermined,
7091 * so we cannot miss interactions due to this rounding.
7093 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7095 else
7097 /* There is no cell size limit */
7098 npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7101 if (!bNoCutOff && npulse > 1)
7103 /* See if we can do with less pulses, based on dlb_scale */
7104 npulse_d_max = 0;
7105 for(d=0; d<dd->ndim; d++)
7107 dim = dd->dim[d];
7108 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7109 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7110 npulse_d_max = max(npulse_d_max,npulse_d);
7112 npulse = min(npulse,npulse_d_max);
7115 /* This env var can override npulse */
7116 d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7117 if (d > 0)
7119 npulse = d;
7122 comm->maxpulse = 1;
7123 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7124 for(d=0; d<dd->ndim; d++)
7126 comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7127 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7128 snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7129 comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7130 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7132 comm->bVacDLBNoLimit = FALSE;
7136 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7137 if (!comm->bVacDLBNoLimit)
7139 comm->cellsize_limit = max(comm->cellsize_limit,
7140 comm->cutoff/comm->maxpulse);
7142 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7143 /* Set the minimum cell size for each DD dimension */
7144 for(d=0; d<dd->ndim; d++)
7146 if (comm->bVacDLBNoLimit ||
7147 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7149 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7151 else
7153 comm->cellsize_min_dlb[dd->dim[d]] =
7154 comm->cutoff/comm->cd[d].np_dlb;
7157 if (comm->cutoff_mbody <= 0)
7159 comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7161 if (comm->bDynLoadBal)
7163 set_dlb_limits(dd);
7167 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7169 /* If each molecule is a single charge group
7170 * or we use domain decomposition for each periodic dimension,
7171 * we do not need to take pbc into account for the bonded interactions.
7173 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7174 !(dd->nc[XX]>1 &&
7175 dd->nc[YY]>1 &&
7176 (dd->nc[ZZ]>1 || ePBC==epbcXY)));
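/* Complete the DD setup once the thread counts and force record are
 * known: initialize the per-thread data, the PME communication
 * structures and the global-to-local index ga2la, and print the
 * resulting settings.
 */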
7179 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7180 t_inputrec *ir,t_forcerec *fr,
7181 gmx_ddbox_t *ddbox)
7183 gmx_domdec_comm_t *comm;
7184 int natoms_tot;
7185 real vol_frac;
7187 comm = dd->comm;
7189 /* Initialize the thread data.
7190 * This cannot be done in init_domain_decomposition,
7191 * as the number of threads is determined later.
7193 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7194 if (comm->nth > 1)
7196 snew(comm->dth,comm->nth);
7199 if (EEL_PME(ir->coulombtype))
7201 init_ddpme(dd,&comm->ddpme[0],0);
7202 if (comm->npmedecompdim >= 2)
7204 init_ddpme(dd,&comm->ddpme[1],1);
7207 else
7209 comm->npmenodes = 0;
7210 if (dd->pme_nodeid >= 0)
7212 gmx_fatal_collective(FARGS,NULL,dd,
7213 "Can not have separate PME nodes without PME electrostatics");
7217 if (debug)
7219 fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7221 if (comm->eDLB != edlbNO)
7223 set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7226 print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7227 if (comm->eDLB == edlbAUTO)
7229 if (fplog)
7231 fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7233 print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7236 if (ir->ePBC == epbcNONE)
7238 vol_frac = 1 - 1/(double)dd->nnodes;
7240 else
7242 vol_frac =
7243 (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7245 if (debug)
7247 fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7249 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7251 dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
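/* Check whether the DD cut-off can be raised to cutoff_req with the
 * current box, i.e. whether the required number of pulses still fits
 * in every dimension; returns FALSE when any node would be limited.
 */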
7254 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7255 real cutoff_req)
7257 gmx_domdec_t *dd;
7258 gmx_ddbox_t ddbox;
7259 int d,dim,np;
7260 real inv_cell_size;
7261 int LocallyLimited;
7263 dd = cr->dd;
7265 set_ddbox(dd,FALSE,cr,ir,state->box,
7266 TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7268 LocallyLimited = 0;
7270 for(d=0; d<dd->ndim; d++)
7272 dim = dd->dim[d];
7274 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7275 if (dynamic_dd_box(&ddbox,ir))
7277 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7280 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7282 if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7283 dd->comm->cd[d].np_dlb > 0)
7285 if (np > dd->comm->cd[d].np_dlb)
7287 return FALSE;
7290 /* If a current local cell size is smaller than the requested
7291 * cut-off, we could still fix it, but this gets very complicated.
7292 * Without fixing here, we might actually need more checks.
7294 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7296 LocallyLimited = 1;
7301 if (dd->comm->eDLB != edlbNO)
7303 if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7305 LocallyLimited = 1;
7308 gmx_sumi(1,&LocallyLimited,cr);
7310 if (LocallyLimited > 0)
7312 return FALSE;
7316 dd->comm->cutoff = cutoff_req;
7318 return TRUE;
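/* Merge the charge groups received in one communication pulse into
 * the zone arrays: first shift the cg's stored by previous pulses to
 * make room, then append the received global indices, centers and
 * cg info zone by zone.
 */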
7321 static void merge_cg_buffers(int ncell,
7322 gmx_domdec_comm_dim_t *cd, int pulse,
7323 int *ncg_cell,
7324 int *index_gl, int *recv_i,
7325 rvec *cg_cm, rvec *recv_vr,
7326 int *cgindex,
7327 cginfo_mb_t *cginfo_mb,int *cginfo)
7329 gmx_domdec_ind_t *ind,*ind_p;
7330 int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7331 int shift,shift_at;
7333 ind = &cd->ind[pulse];
7335 /* First correct the already stored data */
7336 shift = ind->nrecv[ncell];
7337 for(cell=ncell-1; cell>=0; cell--)
7339 shift -= ind->nrecv[cell];
7340 if (shift > 0)
7342 /* Move the cg's present from previous grid pulses */
7343 cg0 = ncg_cell[ncell+cell];
7344 cg1 = ncg_cell[ncell+cell+1];
7345 cgindex[cg1+shift] = cgindex[cg1];
7346 for(cg=cg1-1; cg>=cg0; cg--)
7348 index_gl[cg+shift] = index_gl[cg];
7349 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7350 cgindex[cg+shift] = cgindex[cg];
7351 cginfo[cg+shift] = cginfo[cg];
7353 /* Correct the already stored send indices for the shift */
7354 for(p=1; p<=pulse; p++)
7356 ind_p = &cd->ind[p];
7357 cg0 = 0;
7358 for(c=0; c<cell; c++)
7360 cg0 += ind_p->nsend[c];
7362 cg1 = cg0 + ind_p->nsend[cell];
7363 for(cg=cg0; cg<cg1; cg++)
7365 ind_p->index[cg] += shift;
7371 /* Merge in the communicated buffers */
7372 shift = 0;
7373 shift_at = 0;
7374 cg0 = 0;
7375 for(cell=0; cell<ncell; cell++)
7377 cg1 = ncg_cell[ncell+cell+1] + shift;
7378 if (shift_at > 0)
7380 /* Correct the old cg indices */
7381 for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7383 cgindex[cg+1] += shift_at;
7386 for(cg=0; cg<ind->nrecv[cell]; cg++)
7388 /* Copy this charge group from the buffer */
7389 index_gl[cg1] = recv_i[cg0];
7390 copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7391 /* Add it to the cgindex */
7392 cg_gl = index_gl[cg1];
7393 cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7394 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7395 cgindex[cg1+1] = cgindex[cg1] + nat;
7396 cg0++;
7397 cg1++;
7398 shift_at += nat;
7400 shift += ind->nrecv[cell];
7401 ncg_cell[ncell+cell+1] = cg1;
7405 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7406 int nzone,int cg0,const int *cgindex)
7408 int cg,zone,p;
7410 /* Store the atom block boundaries for easy copying of communication buffers
7412 cg = cg0;
7413 for(zone=0; zone<nzone; zone++)
7415 for(p=0; p<cd->np; p++) {
7416 cd->ind[p].cell2at0[zone] = cgindex[cg];
7417 cg += cd->ind[p].nrecv[zone];
7418 cd->ind[p].cell2at1[zone] = cgindex[cg];
7423 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7425 int i;
7426 gmx_bool bMiss;
7428 bMiss = FALSE;
7429 for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7431 if (!bLocalCG[link->a[i]])
7433 bMiss = TRUE;
7437 return bMiss;
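/*
 * Illustration (added for exposition, not part of the original source).
 * The t_blocka walked by missing_link() is a CSR-style two-array layout:
 * index[] holds per-block offsets into a[], so block i owns the entries
 * a[index[i]] .. a[index[i+1]-1]. With made-up data, three charge groups
 * where cg 0 is linked to {1,2}, cg 1 to {0} and cg 2 to {0} look like:
 */
#if 0
static const int link_index[] = { 0, 2, 3, 4 };
static const int link_a[]     = { 1, 2, 0, 0 };
#endif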
7440 /* Domain corners for communication; at most 4 i-zones can see a j-zone */
7441 typedef struct {
7442 real c[DIM][4]; /* the corners for the non-bonded communication */
7443 real cr0; /* corner for rounding */
7444 real cr1[4]; /* corners for rounding */
7445 real bc[DIM]; /* corners for bonded communication */
7446 real bcr1; /* corner for rounding for bonded communication */
7447 } dd_corners_t;
7449 /* Determine the corners of the domain(s) we are communicating with */
7450 static void
7451 set_dd_corners(const gmx_domdec_t *dd,
7452 int dim0, int dim1, int dim2,
7453 gmx_bool bDistMB,
7454 dd_corners_t *c)
7456 const gmx_domdec_comm_t *comm;
7457 const gmx_domdec_zones_t *zones;
7458 int i,j;
7460 comm = dd->comm;
7462 zones = &comm->zones;
7464 /* Keep the compiler happy */
7465 c->cr0 = 0;
7466 c->bcr1 = 0;
7468 /* The first dimension is equal for all cells */
7469 c->c[0][0] = comm->cell_x0[dim0];
7470 if (bDistMB)
7472 c->bc[0] = c->c[0][0];
7474 if (dd->ndim >= 2)
7476 dim1 = dd->dim[1];
7477 /* This cell row is only seen from the first row */
7478 c->c[1][0] = comm->cell_x0[dim1];
7479 /* All rows can see this row */
7480 c->c[1][1] = comm->cell_x0[dim1];
7481 if (dd->bGridJump)
7483 c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7484 if (bDistMB)
7486 /* For the multi-body distance we need the maximum */
7487 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7490 /* Set the upper-right corner for rounding */
7491 c->cr0 = comm->cell_x1[dim0];
7493 if (dd->ndim >= 3)
7495 dim2 = dd->dim[2];
7496 for(j=0; j<4; j++)
7498 c->c[2][j] = comm->cell_x0[dim2];
7500 if (dd->bGridJump)
7502 /* Use the maximum of the i-cells that see a j-cell */
7503 for(i=0; i<zones->nizone; i++)
7505 for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7507 if (j >= 4)
7509 c->c[2][j-4] =
7510 max(c->c[2][j-4],
7511 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7515 if (bDistMB)
7517 /* For the multi-body distance we need the maximum */
7518 c->bc[2] = comm->cell_x0[dim2];
7519 for(i=0; i<2; i++)
7521 for(j=0; j<2; j++)
7523 c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7529 /* Set the upper-right corner for rounding */
7530 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7531 * Only cell (0,0,0) can see cell 7 (1,1,1)
7533 c->cr1[0] = comm->cell_x1[dim1];
7534 c->cr1[3] = comm->cell_x1[dim1];
7535 if (dd->bGridJump)
7537 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7538 if (bDistMB)
7540 /* For the multi-body distance we need the maximum */
7541 c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
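/*
 * Illustration (added for exposition, not part of the original source).
 * The rounding corners stored above let get_zone_pulse_cgs() below test
 * the true Euclidean distance to a zone corner instead of a slab test
 * per dimension. A sketch of the idea for the rectangular case, with
 * made-up distances: a charge group 0.8 beyond the dim0 corner and 0.7
 * beyond the dim1 corner is sent only if sqrt(0.8^2 + 0.7^2) is within
 * the cut-off, which is stricter than testing each offset separately.
 */
#if 0
static int within_corner_cutoff(double d0, double d1, double cutoff)
{
    double r2 = 0;

    if (d0 > 0) { r2 += d0*d0; }
    if (d1 > 0) { r2 += d1*d1; }

    /* Compare squared distances to avoid a sqrt per charge group */
    return r2 < cutoff*cutoff;
}
#endif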
7548 /* Determine which cg's we need to send in this pulse from this zone */
7549 static void
7550 get_zone_pulse_cgs(gmx_domdec_t *dd,
7551 int zonei, int zone,
7552 int cg0, int cg1,
7553 const int *index_gl,
7554 const int *cgindex,
7555 int dim, int dim_ind,
7556 int dim0, int dim1, int dim2,
7557 real r_comm2, real r_bcomm2,
7558 matrix box,
7559 ivec tric_dist,
7560 rvec *normal,
7561 real skew_fac2_d, real skew_fac_01,
7562 rvec *v_d, rvec *v_0, rvec *v_1,
7563 const dd_corners_t *c,
7564 rvec sf2_round,
7565 gmx_bool bDistBonded,
7566 gmx_bool bBondComm,
7567 gmx_bool bDist2B,
7568 gmx_bool bDistMB,
7569 rvec *cg_cm,
7570 int *cginfo,
7571 gmx_domdec_ind_t *ind,
7572 int **ibuf, int *ibuf_nalloc,
7573 vec_rvec_t *vbuf,
7574 int *nsend_ptr,
7575 int *nat_ptr,
7576 int *nsend_z_ptr)
7578 gmx_domdec_comm_t *comm;
7579 gmx_bool bScrew;
7580 gmx_bool bDistMB_pulse;
7581 int cg,i;
7582 real r2,rb2,r,tric_sh;
7583 rvec rn,rb;
7584 int dimd;
7585 int nsend_z,nsend,nat;
7587 comm = dd->comm;
7589 bScrew = (dd->bScrewPBC && dim == XX);
7591 bDistMB_pulse = (bDistMB && bDistBonded);
7593 nsend_z = 0;
7594 nsend = *nsend_ptr;
7595 nat = *nat_ptr;
7597 for(cg=cg0; cg<cg1; cg++)
7599 r2 = 0;
7600 rb2 = 0;
7601 if (tric_dist[dim_ind] == 0)
7603 /* Rectangular direction, easy */
7604 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7605 if (r > 0)
7607 r2 += r*r;
7609 if (bDistMB_pulse)
7611 r = cg_cm[cg][dim] - c->bc[dim_ind];
7612 if (r > 0)
7614 rb2 += r*r;
7617 /* Rounding gives at most a 16% reduction
7618 * in communicated atoms
7620 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7622 r = cg_cm[cg][dim0] - c->cr0;
7623 /* This is the first dimension, so always r >= 0 */
7624 r2 += r*r;
7625 if (bDistMB_pulse)
7627 rb2 += r*r;
7630 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7632 r = cg_cm[cg][dim1] - c->cr1[zone];
7633 if (r > 0)
7635 r2 += r*r;
7637 if (bDistMB_pulse)
7639 r = cg_cm[cg][dim1] - c->bcr1;
7640 if (r > 0)
7642 rb2 += r*r;
7647 else
7649 /* Triclinic direction, more complicated */
7650 clear_rvec(rn);
7651 clear_rvec(rb);
7652 /* Rounding, conservative as the skew_fac multiplication
7653 * will slightly underestimate the distance.
7655 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7657 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7658 for(i=dim0+1; i<DIM; i++)
7660 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7662 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7663 if (bDistMB_pulse)
7665 rb[dim0] = rn[dim0];
7666 rb2 = r2;
7668 /* Take care that the cell planes along dim0 might not
7669 * be orthogonal to those along dim1 and dim2.
7671 for(i=1; i<=dim_ind; i++)
7673 dimd = dd->dim[i];
7674 if (normal[dim0][dimd] > 0)
7676 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7677 if (bDistMB_pulse)
7679 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7684 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7686 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7687 tric_sh = 0;
7688 for(i=dim1+1; i<DIM; i++)
7690 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7692 rn[dim1] += tric_sh;
7693 if (rn[dim1] > 0)
7695 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7696 /* Take care of coupling of the distances
7697 * to the planes along dim0 and dim1 through dim2.
7699 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7700 /* Take care that the cell planes along dim1
7701 * might not be orthogonal to that along dim2.
7703 if (normal[dim1][dim2] > 0)
7705 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7708 if (bDistMB_pulse)
7710 rb[dim1] +=
7711 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7712 if (rb[dim1] > 0)
7714 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7715 /* Take care of coupling of the distances
7716 * to the planes along dim0 and dim1 through dim2.
7718 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7719 /* Take care that the cell planes along dim1
7720 * might not be orthogonal to that along dim2.
7722 if (normal[dim1][dim2] > 0)
7724 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7729 /* The distance along the communication direction */
7730 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7731 tric_sh = 0;
7732 for(i=dim+1; i<DIM; i++)
7734 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7736 rn[dim] += tric_sh;
7737 if (rn[dim] > 0)
7739 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7740 /* Take care of coupling of the distances
7741 * to the planes along dim0 and dim1 through dim2.
7743 if (dim_ind == 1 && zonei == 1)
7745 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7748 if (bDistMB_pulse)
7750 clear_rvec(rb);
7751 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7752 if (rb[dim] > 0)
7754 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7755 /* Take care of coupling of the distances
7756 * to the planes along dim0 and dim1 through dim2.
7758 if (dim_ind == 1 && zonei == 1)
7760 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7766 if (r2 < r_comm2 ||
7767 (bDistBonded &&
7768 ((bDistMB && rb2 < r_bcomm2) ||
7769 (bDist2B && r2 < r_bcomm2)) &&
7770 (!bBondComm ||
7771 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7772 missing_link(comm->cglink,index_gl[cg],
7773 comm->bLocalCG)))))
7775 /* Make an index to the local charge groups */
7776 if (nsend+1 > ind->nalloc)
7778 ind->nalloc = over_alloc_large(nsend+1);
7779 srenew(ind->index,ind->nalloc);
7781 if (nsend+1 > *ibuf_nalloc)
7783 *ibuf_nalloc = over_alloc_large(nsend+1);
7784 srenew(*ibuf,*ibuf_nalloc);
7786 ind->index[nsend] = cg;
7787 (*ibuf)[nsend] = index_gl[cg];
7788 nsend_z++;
7789 vec_rvec_check_alloc(vbuf,nsend+1);
7791 if (dd->ci[dim] == 0)
7793 /* Correct cg_cm for pbc */
7794 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7795 if (bScrew)
7797 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7798 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7801 else
7803 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7805 nsend++;
7806 nat += cgindex[cg+1] - cgindex[cg];
7810 *nsend_ptr = nsend;
7811 *nat_ptr = nat;
7812 *nsend_z_ptr = nsend_z;
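/*
 * Illustration (added for exposition, not part of the original source).
 * The send-buffer growth in get_zone_pulse_cgs() follows the usual
 * amortized-append pattern: when one more entry would overflow the
 * allocation, the capacity is grown multiplicatively and the buffer is
 * reallocated, so repeated appends stay O(1) on average. A generic
 * sketch; the ~20% factor stands in for the over_alloc policy.
 */
#if 0
#include <stdlib.h>

static void push_int(int **buf, int *nalloc, int n, int value)
{
    if (n + 1 > *nalloc)
    {
        *nalloc = 1 + (int)(1.2*(n + 1)); /* grow ~20% over what we need */
        *buf    = realloc(*buf, (*nalloc)*sizeof(**buf));
    }
    (*buf)[n] = value;
}
#endif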
7815 static void setup_dd_communication(gmx_domdec_t *dd,
7816 matrix box,gmx_ddbox_t *ddbox,
7817 t_forcerec *fr,t_state *state,rvec **f)
7819 int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7820 int nzone,nzone_send,zone,zonei,cg0,cg1;
7821 int c,i,j,cg,cg_gl,nrcg;
7822 int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7823 gmx_domdec_comm_t *comm;
7824 gmx_domdec_zones_t *zones;
7825 gmx_domdec_comm_dim_t *cd;
7826 gmx_domdec_ind_t *ind;
7827 cginfo_mb_t *cginfo_mb;
7828 gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7829 real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7830 dd_corners_t corners;
7831 ivec tric_dist;
7832 rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7833 real skew_fac2_d,skew_fac_01;
7834 rvec sf2_round;
7835 int nsend,nat;
7836 int th;
7838 if (debug)
7840 fprintf(debug,"Setting up DD communication\n");
7843 comm = dd->comm;
7845 switch (fr->cutoff_scheme)
7847 case ecutsGROUP:
7848 cg_cm = fr->cg_cm;
7849 break;
7850 case ecutsVERLET:
7851 cg_cm = state->x;
7852 break;
7853 default:
7854 gmx_incons("unimplemented");
7855 cg_cm = NULL;
7858 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7860 dim = dd->dim[dim_ind];
7862 /* Check if we need to use triclinic distances */
7863 tric_dist[dim_ind] = 0;
7864 for(i=0; i<=dim_ind; i++)
7866 if (ddbox->tric_dir[dd->dim[i]])
7868 tric_dist[dim_ind] = 1;
7873 bBondComm = comm->bBondComm;
7875 /* Do we need to determine extra distances for multi-body bondeds? */
7876 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7878 /* Do we need to determine extra distances for only two-body bondeds? */
7879 bDist2B = (bBondComm && !bDistMB);
7881 r_comm2 = sqr(comm->cutoff);
7882 r_bcomm2 = sqr(comm->cutoff_mbody);
7884 if (debug)
7886 fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7889 zones = &comm->zones;
7891 dim0 = dd->dim[0];
7892 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7893 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7895 set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7897 /* Triclinic stuff */
7898 normal = ddbox->normal;
7899 skew_fac_01 = 0;
7900 if (dd->ndim >= 2)
7902 v_0 = ddbox->v[dim0];
7903 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7905 /* Determine the coupling coefficient for the distances
7906 * to the cell planes along dim0 and dim1 through dim2.
7907 * This is required for correct rounding.
7909 skew_fac_01 =
7910 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7911 if (debug)
7913 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7917 if (dd->ndim >= 3)
7919 v_1 = ddbox->v[dim1];
7922 zone_cg_range = zones->cg_range;
7923 index_gl = dd->index_gl;
7924 cgindex = dd->cgindex;
7925 cginfo_mb = fr->cginfo_mb;
7927 zone_cg_range[0] = 0;
7928 zone_cg_range[1] = dd->ncg_home;
7929 comm->zone_ncg1[0] = dd->ncg_home;
7930 pos_cg = dd->ncg_home;
7932 nat_tot = dd->nat_home;
7933 nzone = 1;
7934 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7936 dim = dd->dim[dim_ind];
7937 cd = &comm->cd[dim_ind];
7939 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7941 /* No pbc in this dimension, the first node should not communicate */
7942 nzone_send = 0;
7944 else
7946 nzone_send = nzone;
7949 v_d = ddbox->v[dim];
7950 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7952 cd->bInPlace = TRUE;
7953 for(p=0; p<cd->np; p++)
7955 /* Only atoms communicated in the first pulse are used
7956 * for multi-body bonded interactions or for bBondComm.
7958 bDistBonded = ((bDistMB || bDist2B) && p == 0);
7960 ind = &cd->ind[p];
7961 nsend = 0;
7962 nat = 0;
7963 for(zone=0; zone<nzone_send; zone++)
7965 if (tric_dist[dim_ind] && dim_ind > 0)
7967 /* Determine slightly more optimized skew_fac's
7968 * for rounding.
7969 * This reduces the number of communicated atoms
7970 * by about 10% for 3D DD of rhombic dodecahedra.
7972 for(dimd=0; dimd<dim; dimd++)
7974 sf2_round[dimd] = 1;
7975 if (ddbox->tric_dir[dimd])
7977 for(i=dd->dim[dimd]+1; i<DIM; i++)
7979 /* If we are shifted in dimension i
7980 * and the cell plane is tilted forward
7981 * in dimension i, skip this coupling.
7983 if (!(zones->shift[nzone+zone][i] &&
7984 ddbox->v[dimd][i][dimd] >= 0))
7986 sf2_round[dimd] +=
7987 sqr(ddbox->v[dimd][i][dimd]);
7990 sf2_round[dimd] = 1/sf2_round[dimd];
7995 zonei = zone_perm[dim_ind][zone];
7996 if (p == 0)
7998 /* Here we permute the zones to obtain a convenient order
7999 * for neighbor searching */
8001 cg0 = zone_cg_range[zonei];
8002 cg1 = zone_cg_range[zonei+1];
8004 else
8006 /* Look only at the cg's received in the previous grid pulse */
8008 cg1 = zone_cg_range[nzone+zone+1];
8009 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8012 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8013 for(th=0; th<comm->nth; th++)
8015 gmx_domdec_ind_t *ind_p;
8016 int **ibuf_p,*ibuf_nalloc_p;
8017 vec_rvec_t *vbuf_p;
8018 int *nsend_p,*nat_p;
8019 int *nsend_zone_p;
8020 int cg0_th,cg1_th;
8022 if (th == 0)
8024 /* Thread 0 writes in the comm buffers */
8025 ind_p = ind;
8026 ibuf_p = &comm->buf_int;
8027 ibuf_nalloc_p = &comm->nalloc_int;
8028 vbuf_p = &comm->vbuf;
8029 nsend_p = &nsend;
8030 nat_p = &nat;
8031 nsend_zone_p = &ind->nsend[zone];
8033 else
8035 /* Other threads write into temp buffers */
8036 ind_p = &comm->dth[th].ind;
8037 ibuf_p = &comm->dth[th].ibuf;
8038 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8039 vbuf_p = &comm->dth[th].vbuf;
8040 nsend_p = &comm->dth[th].nsend;
8041 nat_p = &comm->dth[th].nat;
8042 nsend_zone_p = &comm->dth[th].nsend_zone;
8044 comm->dth[th].nsend = 0;
8045 comm->dth[th].nat = 0;
8046 comm->dth[th].nsend_zone = 0;
8049 if (comm->nth == 1)
8051 cg0_th = cg0;
8052 cg1_th = cg1;
8054 else
8056 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8057 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8060 /* Get the cg's for this pulse in this zone */
8061 get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8062 index_gl,cgindex,
8063 dim,dim_ind,dim0,dim1,dim2,
8064 r_comm2,r_bcomm2,
8065 box,tric_dist,
8066 normal,skew_fac2_d,skew_fac_01,
8067 v_d,v_0,v_1,&corners,sf2_round,
8068 bDistBonded,bBondComm,
8069 bDist2B,bDistMB,
8070 cg_cm,fr->cginfo,
8071 ind_p,
8072 ibuf_p,ibuf_nalloc_p,
8073 vbuf_p,
8074 nsend_p,nat_p,
8075 nsend_zone_p);
8078 /* Append data of threads>=1 to the communication buffers */
8079 for(th=1; th<comm->nth; th++)
8081 dd_comm_setup_work_t *dth;
8082 int i,ns1;
8084 dth = &comm->dth[th];
8086 ns1 = nsend + dth->nsend_zone;
8087 if (ns1 > ind->nalloc)
8089 ind->nalloc = over_alloc_dd(ns1);
8090 srenew(ind->index,ind->nalloc);
8092 if (ns1 > comm->nalloc_int)
8094 comm->nalloc_int = over_alloc_dd(ns1);
8095 srenew(comm->buf_int,comm->nalloc_int);
8097 if (ns1 > comm->vbuf.nalloc)
8099 comm->vbuf.nalloc = over_alloc_dd(ns1);
8100 srenew(comm->vbuf.v,comm->vbuf.nalloc);
8103 for(i=0; i<dth->nsend_zone; i++)
8105 ind->index[nsend] = dth->ind.index[i];
8106 comm->buf_int[nsend] = dth->ibuf[i];
8107 copy_rvec(dth->vbuf.v[i],
8108 comm->vbuf.v[nsend]);
8109 nsend++;
8111 nat += dth->nat;
8112 ind->nsend[zone] += dth->nsend_zone;
8115 /* Clear the counts in case we do not have pbc */
8116 for(zone=nzone_send; zone<nzone; zone++)
8118 ind->nsend[zone] = 0;
8120 ind->nsend[nzone] = nsend;
8121 ind->nsend[nzone+1] = nat;
8122 /* Communicate the number of cg's and atoms to receive */
8123 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8124 ind->nsend, nzone+2,
8125 ind->nrecv, nzone+2);
8127 /* The rvec buffer is also required for atom buffers of size nsend
8128 * in dd_move_x and dd_move_f.
8130 vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8132 if (p > 0)
8134 /* We can receive in place if only the last zone is not empty */
8135 for(zone=0; zone<nzone-1; zone++)
8137 if (ind->nrecv[zone] > 0)
8139 cd->bInPlace = FALSE;
8142 if (!cd->bInPlace)
8144 /* The int buffer is only required here for the cg indices */
8145 if (ind->nrecv[nzone] > comm->nalloc_int2)
8147 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8148 srenew(comm->buf_int2,comm->nalloc_int2);
8150 /* The rvec buffer is also required for atom buffers
8151 * of size nrecv in dd_move_x and dd_move_f.
8153 i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8154 vec_rvec_check_alloc(&comm->vbuf2,i);
8158 /* Make space for the global cg indices */
8159 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8160 || dd->cg_nalloc == 0)
8162 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8163 srenew(index_gl,dd->cg_nalloc);
8164 srenew(cgindex,dd->cg_nalloc+1);
8166 /* Communicate the global cg indices */
8167 if (cd->bInPlace)
8169 recv_i = index_gl + pos_cg;
8171 else
8173 recv_i = comm->buf_int2;
8175 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8176 comm->buf_int, nsend,
8177 recv_i, ind->nrecv[nzone]);
8179 /* Make space for cg_cm */
8180 dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8181 if (fr->cutoff_scheme == ecutsGROUP)
8183 cg_cm = fr->cg_cm;
8185 else
8187 cg_cm = state->x;
8189 /* Communicate cg_cm */
8190 if (cd->bInPlace)
8192 recv_vr = cg_cm + pos_cg;
8194 else
8196 recv_vr = comm->vbuf2.v;
8198 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8199 comm->vbuf.v, nsend,
8200 recv_vr, ind->nrecv[nzone]);
8202 /* Make the charge group index */
8203 if (cd->bInPlace)
8205 zone = (p == 0 ? 0 : nzone - 1);
8206 while (zone < nzone)
8208 for(cg=0; cg<ind->nrecv[zone]; cg++)
8210 cg_gl = index_gl[pos_cg];
8211 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8212 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8213 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8214 if (bBondComm)
8216 /* Update the charge group presence,
8217 * so we can use it in the next pass of the loop.
8219 comm->bLocalCG[cg_gl] = TRUE;
8221 pos_cg++;
8223 if (p == 0)
8225 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8227 zone++;
8228 zone_cg_range[nzone+zone] = pos_cg;
8231 else
8233 /* This part of the code is never executed with bBondComm. */
8234 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8235 index_gl,recv_i,cg_cm,recv_vr,
8236 cgindex,fr->cginfo_mb,fr->cginfo);
8237 pos_cg += ind->nrecv[nzone];
8239 nat_tot += ind->nrecv[nzone+1];
8241 if (!cd->bInPlace)
8243 /* Store the atom block for easy copying of communication buffers */
8244 make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8246 nzone += nzone;
8248 dd->index_gl = index_gl;
8249 dd->cgindex = cgindex;
8251 dd->ncg_tot = zone_cg_range[zones->n];
8252 dd->nat_tot = nat_tot;
8253 comm->nat[ddnatHOME] = dd->nat_home;
8254 for(i=ddnatZONE; i<ddnatNR; i++)
8256 comm->nat[i] = dd->nat_tot;
8259 if (!bBondComm)
8261 /* We don't need to update cginfo, since that was already done above.
8262 * So we pass NULL for the forcerec. */
8264 dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8265 NULL,comm->bLocalCG);
8268 if (debug)
8270 fprintf(debug,"Finished setting up DD communication, zones:");
8271 for(c=0; c<zones->n; c++)
8273 fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8275 fprintf(debug,"\n");
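/*
 * Illustration (added for exposition, not part of the original source).
 * The exchanges above follow the usual two-phase halo pattern: first the
 * counts are exchanged (ind->nsend -> ind->nrecv), then the payload
 * buffers are sized from the received counts and the actual indices and
 * coordinates are exchanged. A minimal sketch of the same pattern with
 * raw MPI; dd_sendrecv_int() wraps this per DD dimension and direction.
 */
#if 0
#include <mpi.h>

static void halo_exchange_int(int rank_fwd, int rank_bwd,
                              const int *send, int nsend,
                              int *recv, int *nrecv, MPI_Comm comm)
{
    /* Phase 1: exchange the counts */
    MPI_Sendrecv(&nsend, 1, MPI_INT, rank_fwd, 0,
                 nrecv,  1, MPI_INT, rank_bwd, 0,
                 comm, MPI_STATUS_IGNORE);

    /* Phase 2: exchange the payload, sized by the received count;
     * the caller must ensure recv has room for *nrecv entries. */
    MPI_Sendrecv((void *)send, nsend, MPI_INT, rank_fwd, 1,
                 recv, *nrecv, MPI_INT, rank_bwd, 1,
                 comm, MPI_STATUS_IGNORE);
}
#endif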
8279 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8281 int c;
8283 for(c=0; c<zones->nizone; c++)
8285 zones->izone[c].cg1 = zones->cg_range[c+1];
8286 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8287 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8291 static void set_zones_size(gmx_domdec_t *dd,
8292 matrix box,const gmx_ddbox_t *ddbox,
8293 int zone_start,int zone_end)
8295 gmx_domdec_comm_t *comm;
8296 gmx_domdec_zones_t *zones;
8297 gmx_bool bDistMB;
8298 int z,zi,zj0,zj1,d,dim;
8299 real rcs,rcmbs;
8300 int i,j;
8301 real size_j,add_tric;
8302 real vol;
8304 comm = dd->comm;
8306 zones = &comm->zones;
8308 /* Do we need to determine extra distances for multi-body bondeds? */
8309 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8311 for(z=zone_start; z<zone_end; z++)
8313 /* Copy cell limits to zone limits.
8314 * Valid for non-DD dims and non-shifted dims.
8316 copy_rvec(comm->cell_x0,zones->size[z].x0);
8317 copy_rvec(comm->cell_x1,zones->size[z].x1);
8320 for(d=0; d<dd->ndim; d++)
8322 dim = dd->dim[d];
8324 for(z=0; z<zones->n; z++)
8326 /* With a staggered grid we have different sizes
8327 * for non-shifted dimensions.
8329 if (dd->bGridJump && zones->shift[z][dim] == 0)
8331 if (d == 1)
8333 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8334 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8336 else if (d == 2)
8338 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8339 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8344 rcs = comm->cutoff;
8345 rcmbs = comm->cutoff_mbody;
8346 if (ddbox->tric_dir[dim])
8348 rcs /= ddbox->skew_fac[dim];
8349 rcmbs /= ddbox->skew_fac[dim];
8352 /* Set the lower limit for the shifted zone dimensions */
8353 for(z=zone_start; z<zone_end; z++)
8355 if (zones->shift[z][dim] > 0)
8357 dim = dd->dim[d];
8358 if (!dd->bGridJump || d == 0)
8360 zones->size[z].x0[dim] = comm->cell_x1[dim];
8361 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8363 else
8365 /* Here we take the lower limit of the zone from
8366 * the lowest domain of the zone below.
8368 if (z < 4)
8370 zones->size[z].x0[dim] =
8371 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8373 else
8375 if (d == 1)
8377 zones->size[z].x0[dim] =
8378 zones->size[zone_perm[2][z-4]].x0[dim];
8380 else
8382 zones->size[z].x0[dim] =
8383 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8386 /* A temporary limit; it is updated below */
8387 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8389 if (bDistMB)
8391 for(zi=0; zi<zones->nizone; zi++)
8393 if (zones->shift[zi][dim] == 0)
8395 /* This takes the whole zone into account.
8396 * With multiple pulses this will lead
8397 * to a larger zone than strictly necessary. */
8399 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8400 zones->size[zi].x1[dim]+rcmbs);
8408 /* Loop over the i-zones to set the upper limit of each
8409 * j-zone they see.
8411 for(zi=0; zi<zones->nizone; zi++)
8413 if (zones->shift[zi][dim] == 0)
8415 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8417 if (zones->shift[z][dim] > 0)
8419 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8420 zones->size[zi].x1[dim]+rcs);
8427 for(z=zone_start; z<zone_end; z++)
8429 for(i=0; i<DIM; i++)
8431 zones->size[z].bb_x0[i] = zones->size[z].x0[i];
8432 zones->size[z].bb_x1[i] = zones->size[z].x1[i];
8434 for(j=i+1; j<ddbox->npbcdim; j++)
8436 /* With 1D domain decomposition the cg's are not in
8437 * the triclinic box, but in a triclinic x-y and rectangular y-z box. */
8439 if (box[j][i] != 0 &&
8440 !(dd->ndim == 1 && i == YY && j == ZZ))
8442 /* Correct for triclinic offset of the lower corner */
8443 add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
8444 zones->size[z].bb_x0[i] += add_tric;
8445 zones->size[z].bb_x1[i] += add_tric;
8447 /* Correct for triclinic offset of the upper corner */
8448 size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
8449 add_tric = size_j*box[j][i]/box[j][j];
8451 if (box[j][i] < 0)
8453 zones->size[z].bb_x0[i] += add_tric;
8455 else
8457 zones->size[z].bb_x1[i] += add_tric;
8464 if (zone_start == 0)
8466 vol = 1;
8467 for(dim=0; dim<DIM; dim++)
8469 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8471 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8474 if (debug)
8476 for(z=zone_start; z<zone_end; z++)
8478 fprintf(debug,"zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8480 zones->size[z].x0[XX],zones->size[z].x1[XX],
8481 zones->size[z].x0[YY],zones->size[z].x1[YY],
8482 zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8483 fprintf(debug,"zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8485 zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8486 zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8487 zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
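/*
 * Illustration (added for exposition, not part of the original source).
 * The bounding-box corrections above follow directly from the triclinic
 * box matrix: a zone extent along a tilted axis j shifts the bounding
 * box along axis i by box[j][i]/box[j][j] per unit of j. With made-up
 * numbers box[ZZ][XX] = 1.5, box[ZZ][ZZ] = 6.0 and a zone lower corner
 * at z = 2.0, the x offset is add_tric = 2.0*1.5/6.0 = 0.5.
 */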
8492 static int comp_cgsort(const void *a,const void *b)
8494 int comp;
8496 gmx_cgsort_t *cga,*cgb;
8497 cga = (gmx_cgsort_t *)a;
8498 cgb = (gmx_cgsort_t *)b;
8500 comp = cga->nsc - cgb->nsc;
8501 if (comp == 0)
8503 comp = cga->ind_gl - cgb->ind_gl;
8506 return comp;
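/*
 * Illustration (added for exposition, not part of the original source).
 * comp_cgsort() is a standard qsort() comparator keyed on the ns cell
 * index, tie-broken on the global index. The subtraction idiom is fine
 * here because both keys are small non-negative ints; for arbitrary
 * ints the portable overflow-free form is:
 */
#if 0
static int comp_int_safe(int a, int b)
{
    return (a > b) - (a < b); /* yields -1, 0 or +1, never overflows */
}
#endif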
8509 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8510 int *a,int *buf)
8512 int i;
8514 /* Order the data */
8515 for(i=0; i<n; i++)
8517 buf[i] = a[sort[i].ind];
8520 /* Copy back to the original array */
8521 for(i=0; i<n; i++)
8523 a[i] = buf[i];
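/*
 * Illustration (added for exposition, not part of the original source).
 * Applying the sort permutation through a scratch buffer, as above, is
 * the simple O(n) approach: buf[i] = a[sort[i].ind], then copy back.
 * Doing it in place would require following permutation cycles; the
 * scratch buffer is cheaper and far simpler.
 */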
8527 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8528 rvec *v,rvec *buf)
8530 int i;
8532 /* Order the data */
8533 for(i=0; i<n; i++)
8535 copy_rvec(v[sort[i].ind],buf[i]);
8538 /* Copy back to the original array */
8539 for(i=0; i<n; i++)
8541 copy_rvec(buf[i],v[i]);
8545 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8546 rvec *v,rvec *buf)
8548 int a,atot,cg,cg0,cg1,i;
8550 if (cgindex == NULL)
8552 /* Avoid the useless loop of the atoms within a cg */
8553 order_vec_cg(ncg,sort,v,buf);
8555 return;
8558 /* Order the data */
8559 a = 0;
8560 for(cg=0; cg<ncg; cg++)
8562 cg0 = cgindex[sort[cg].ind];
8563 cg1 = cgindex[sort[cg].ind+1];
8564 for(i=cg0; i<cg1; i++)
8566 copy_rvec(v[i],buf[a]);
8567 a++;
8570 atot = a;
8572 /* Copy back to the original array */
8573 for(a=0; a<atot; a++)
8575 copy_rvec(buf[a],v[a]);
8579 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8580 int nsort_new,gmx_cgsort_t *sort_new,
8581 gmx_cgsort_t *sort1)
8583 int i1,i2,i_new;
8585 /* The new indices are not very ordered, so we qsort them */
8586 qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8588 /* sort2 is already ordered, so now we can merge the two arrays */
8589 i1 = 0;
8590 i2 = 0;
8591 i_new = 0;
8592 while(i2 < nsort2 || i_new < nsort_new)
8594 if (i2 == nsort2)
8596 sort1[i1++] = sort_new[i_new++];
8598 else if (i_new == nsort_new)
8600 sort1[i1++] = sort2[i2++];
8602 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8603 (sort2[i2].nsc == sort_new[i_new].nsc &&
8604 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8606 sort1[i1++] = sort2[i2++];
8608 else
8610 sort1[i1++] = sort_new[i_new++];
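/*
 * Illustration (added for exposition, not part of the original source).
 * The loop above is a classic two-way merge of sorted runs: with nsc
 * keys sort2 = {1,3,5} and sort_new = {2,4} (after the qsort), sort1
 * comes out as {1,2,3,4,5}. qsort-ing only the short list of moved
 * charge groups and merging costs O(n + m log m) instead of the
 * O(n log n) of re-sorting everything.
 */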
8615 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8617 gmx_domdec_sort_t *sort;
8618 gmx_cgsort_t *cgsort,*sort_i;
8619 int ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8620 int sort_last,sort_skip;
8622 sort = dd->comm->sort;
8624 a = fr->ns.grid->cell_index;
8626 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8628 if (ncg_home_old >= 0)
8630 /* The charge groups that remained in the same ns grid cell
8631 * are already completely ordered, so we can sort efficiently
8632 * by sorting only the moved charge groups and merging them in. */
8634 ncg_new = 0;
8635 nsort2 = 0;
8636 nsort_new = 0;
8637 for(i=0; i<dd->ncg_home; i++)
8639 /* Check if this cg did not move to another node */
8640 if (a[i] < moved)
8642 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8644 /* This cg is new on this node or moved ns grid cell */
8645 if (nsort_new >= sort->sort_new_nalloc)
8647 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8648 srenew(sort->sort_new,sort->sort_new_nalloc);
8650 sort_i = &(sort->sort_new[nsort_new++]);
8652 else
8654 /* This cg did not move */
8655 sort_i = &(sort->sort2[nsort2++]);
8657 /* Sort on the ns grid cell indices
8658 * and the global topology index.
8659 * index_gl is irrelevant with cell ns,
8660 * but we set it here anyhow to avoid a conditional.
8662 sort_i->nsc = a[i];
8663 sort_i->ind_gl = dd->index_gl[i];
8664 sort_i->ind = i;
8665 ncg_new++;
8668 if (debug)
8670 fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8671 nsort2,nsort_new);
8673 /* Sort efficiently */
8674 ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8675 sort->sort);
8677 else
8679 cgsort = sort->sort;
8680 ncg_new = 0;
8681 for(i=0; i<dd->ncg_home; i++)
8683 /* Sort on the ns grid cell indices
8684 * and the global topology index
8686 cgsort[i].nsc = a[i];
8687 cgsort[i].ind_gl = dd->index_gl[i];
8688 cgsort[i].ind = i;
8689 if (cgsort[i].nsc < moved)
8691 ncg_new++;
8694 if (debug)
8696 fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8698 /* Determine the order of the charge groups using qsort */
8699 qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8702 return ncg_new;
8705 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8707 gmx_cgsort_t *sort;
8708 int ncg_new,i,*a,na;
8710 sort = dd->comm->sort->sort;
8712 nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8714 ncg_new = 0;
8715 for(i=0; i<na; i++)
8717 if (a[i] >= 0)
8719 sort[ncg_new].ind = a[i];
8720 ncg_new++;
8724 return ncg_new;
8727 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8728 rvec *cgcm,t_forcerec *fr,t_state *state,
8729 int ncg_home_old)
8731 gmx_domdec_sort_t *sort;
8732 gmx_cgsort_t *cgsort,*sort_i;
8733 int *cgindex;
8734 int ncg_new,i,*ibuf,cgsize;
8735 rvec *vbuf;
8737 sort = dd->comm->sort;
8739 if (dd->ncg_home > sort->sort_nalloc)
8741 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8742 srenew(sort->sort,sort->sort_nalloc);
8743 srenew(sort->sort2,sort->sort_nalloc);
8745 cgsort = sort->sort;
8747 switch (fr->cutoff_scheme)
8749 case ecutsGROUP:
8750 ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8751 break;
8752 case ecutsVERLET:
8753 ncg_new = dd_sort_order_nbnxn(dd,fr);
8754 break;
8755 default:
8756 gmx_incons("unimplemented");
8757 ncg_new = 0;
8760 /* We alloc with the old size, since cgindex is still old */
8761 vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8762 vbuf = dd->comm->vbuf.v;
8764 if (dd->comm->bCGs)
8766 cgindex = dd->cgindex;
8768 else
8770 cgindex = NULL;
8773 /* Remove the charge groups which are no longer at home here */
8774 dd->ncg_home = ncg_new;
8775 if (debug)
8777 fprintf(debug,"Set the new home charge group count to %d\n",
8778 dd->ncg_home);
8781 /* Reorder the state */
8782 for(i=0; i<estNR; i++)
8784 if (EST_DISTR(i) && (state->flags & (1<<i)))
8786 switch (i)
8788 case estX:
8789 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8790 break;
8791 case estV:
8792 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8793 break;
8794 case estSDX:
8795 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8796 break;
8797 case estCGP:
8798 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8799 break;
8800 case estLD_RNG:
8801 case estLD_RNGI:
8802 case estDISRE_INITF:
8803 case estDISRE_RM3TAV:
8804 case estORIRE_INITF:
8805 case estORIRE_DTAV:
8806 /* No ordering required */
8807 break;
8808 default:
8809 gmx_incons("Unknown state entry encountered in dd_sort_state");
8810 break;
8814 if (fr->cutoff_scheme == ecutsGROUP)
8816 /* Reorder cgcm */
8817 order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8820 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8822 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8823 srenew(sort->ibuf,sort->ibuf_nalloc);
8825 ibuf = sort->ibuf;
8826 /* Reorder the global cg index */
8827 order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8828 /* Reorder the cginfo */
8829 order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8830 /* Rebuild the local cg index */
8831 if (dd->comm->bCGs)
8833 ibuf[0] = 0;
8834 for(i=0; i<dd->ncg_home; i++)
8836 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8837 ibuf[i+1] = ibuf[i] + cgsize;
8839 for(i=0; i<dd->ncg_home+1; i++)
8841 dd->cgindex[i] = ibuf[i];
8844 else
8846 for(i=0; i<dd->ncg_home+1; i++)
8848 dd->cgindex[i] = i;
8851 /* Set the home atom number */
8852 dd->nat_home = dd->cgindex[dd->ncg_home];
8854 if (fr->cutoff_scheme == ecutsVERLET)
8856 /* The atoms are now exactly in grid order, update the grid order */
8857 nbnxn_set_atomorder(fr->nbv->nbs);
8859 else
8861 /* Copy the sorted ns cell indices back to the ns grid struct */
8862 for(i=0; i<dd->ncg_home; i++)
8864 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8866 fr->ns.grid->nr = dd->ncg_home;
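/*
 * Illustration (added for exposition, not part of the original source).
 * The cgindex rebuild in dd_sort_state() is a plain exclusive prefix
 * sum over the reordered charge-group sizes. A minimal sketch:
 */
#if 0
static void prefix_sum(const int *size, int n, int *index)
{
    int i;

    index[0] = 0;
    for (i = 0; i < n; i++)
    {
        index[i+1] = index[i] + size[i]; /* index has n+1 entries */
    }
}
#endif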
8870 static void add_dd_statistics(gmx_domdec_t *dd)
8872 gmx_domdec_comm_t *comm;
8873 int ddnat;
8875 comm = dd->comm;
8877 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8879 comm->sum_nat[ddnat-ddnatZONE] +=
8880 comm->nat[ddnat] - comm->nat[ddnat-1];
8882 comm->ndecomp++;
8885 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8887 gmx_domdec_comm_t *comm;
8888 int ddnat;
8890 comm = dd->comm;
8892 /* Reset all the statistics and counters for total run counting */
8893 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8895 comm->sum_nat[ddnat-ddnatZONE] = 0;
8897 comm->ndecomp = 0;
8898 comm->nload = 0;
8899 comm->load_step = 0;
8900 comm->load_sum = 0;
8901 comm->load_max = 0;
8902 clear_ivec(comm->load_lim);
8903 comm->load_mdf = 0;
8904 comm->load_pme = 0;
8907 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8909 gmx_domdec_comm_t *comm;
8910 int ddnat;
8911 double av;
8913 comm = cr->dd->comm;
8915 gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8917 if (fplog == NULL)
8919 return;
8922 fprintf(fplog,"\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
8924 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8926 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8927 switch(ddnat)
8929 case ddnatZONE:
8930 fprintf(fplog,
8931 " av. #atoms communicated per step for force: %d x %.1f\n",
8932 2,av);
8933 break;
8934 case ddnatVSITE:
8935 if (cr->dd->vsite_comm)
8937 fprintf(fplog,
8938 " av. #atoms communicated per step for vsites: %d x %.1f\n",
8939 (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8940 av);
8942 break;
8943 case ddnatCON:
8944 if (cr->dd->constraint_comm)
8946 fprintf(fplog,
8947 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
8948 1 + ir->nLincsIter,av);
8950 break;
8951 default:
8952 gmx_incons(" Unknown type for DD statistics");
8955 fprintf(fplog,"\n");
8957 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8959 print_dd_load_av(fplog,cr->dd);
8963 void dd_partition_system(FILE *fplog,
8964 gmx_large_int_t step,
8965 t_commrec *cr,
8966 gmx_bool bMasterState,
8967 int nstglobalcomm,
8968 t_state *state_global,
8969 gmx_mtop_t *top_global,
8970 t_inputrec *ir,
8971 t_state *state_local,
8972 rvec **f,
8973 t_mdatoms *mdatoms,
8974 gmx_localtop_t *top_local,
8975 t_forcerec *fr,
8976 gmx_vsite_t *vsite,
8977 gmx_shellfc_t shellfc,
8978 gmx_constr_t constr,
8979 t_nrnb *nrnb,
8980 gmx_wallcycle_t wcycle,
8981 gmx_bool bVerbose)
8983 gmx_domdec_t *dd;
8984 gmx_domdec_comm_t *comm;
8985 gmx_ddbox_t ddbox={0};
8986 t_block *cgs_gl;
8987 gmx_large_int_t step_pcoupl;
8988 rvec cell_ns_x0,cell_ns_x1;
8989 int i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
8990 gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8991 gmx_bool bRedist,bSortCG,bResortAll;
8992 ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
8993 real grid_density;
8994 char sbuf[22];
8996 dd = cr->dd;
8997 comm = dd->comm;
8999 bBoxChanged = (bMasterState || DEFORM(*ir));
9000 if (ir->epc != epcNO)
9002 /* With nstpcouple > 1 pressure coupling happens
9003 * one step after calculating the pressure.
9004 * Box scaling happens at the end of the MD step,
9005 * after the DD partitioning.
9006 * We therefore have to do DLB in the first partitioning
9007 * after an MD step where P-coupling occurred.
9008 * We need to determine the last step in which p-coupling occurred.
9009 * MRS -- need to validate this for vv? */
9011 n = ir->nstpcouple;
9012 if (n == 1)
9014 step_pcoupl = step - 1;
9016 else
9018 step_pcoupl = ((step - 1)/n)*n + 1;
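/* Illustration (added for exposition, not part of the original source):
 * the formula above rounds step-1 down to a multiple of n and adds 1,
 * giving the most recent step of the form k*n + 1. With made-up n = 5,
 * steps 6..10 all map to step_pcoupl = 6 and step 11 maps to 11.
 */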
9020 if (step_pcoupl >= comm->partition_step)
9022 bBoxChanged = TRUE;
9026 bNStGlobalComm = (step % nstglobalcomm == 0);
9028 if (!comm->bDynLoadBal)
9030 bDoDLB = FALSE;
9032 else
9034 /* Should we do dynamic load balancing this step?
9035 * Since it requires (possibly expensive) global communication,
9036 * we might want to do DLB less frequently. */
9038 if (bBoxChanged || ir->epc != epcNO)
9040 bDoDLB = bBoxChanged;
9042 else
9044 bDoDLB = bNStGlobalComm;
9048 /* Check if we have recorded loads on the nodes */
9049 if (comm->bRecordLoad && dd_load_count(comm))
9051 if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9053 /* Check if we should use DLB at the second partitioning
9054 * and every 100 partitionings,
9055 * so the extra communication cost is negligible.
9057 n = max(100,nstglobalcomm);
9058 bCheckDLB = (comm->n_load_collect == 0 ||
9059 comm->n_load_have % n == n-1);
9061 else
9063 bCheckDLB = FALSE;
9066 /* Print load every nstlog, first and last step to the log file */
9067 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9068 comm->n_load_collect == 0 ||
9069 (ir->nsteps >= 0 &&
9070 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9072 /* Avoid extra communication due to verbose screen output
9073 * when nstglobalcomm is set.
9075 if (bDoDLB || bLogLoad || bCheckDLB ||
9076 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9078 get_load_distribution(dd,wcycle);
9079 if (DDMASTER(dd))
9081 if (bLogLoad)
9083 dd_print_load(fplog,dd,step-1);
9085 if (bVerbose)
9087 dd_print_load_verbose(dd);
9090 comm->n_load_collect++;
9092 if (bCheckDLB) {
9093 /* Since the timings are node dependent, the master decides */
9094 if (DDMASTER(dd))
9096 bTurnOnDLB =
9097 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9098 if (debug)
9100 fprintf(debug,"step %s, imb loss %f\n",
9101 gmx_step_str(step,sbuf),
9102 dd_force_imb_perf_loss(dd));
9105 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9106 if (bTurnOnDLB)
9108 turn_on_dlb(fplog,cr,step);
9109 bDoDLB = TRUE;
9113 comm->n_load_have++;
9116 cgs_gl = &comm->cgs_gl;
9118 bRedist = FALSE;
9119 if (bMasterState)
9121 /* Clear the old state */
9122 clear_dd_indices(dd,0,0);
9124 set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9125 TRUE,cgs_gl,state_global->x,&ddbox);
9127 get_cg_distribution(fplog,step,dd,cgs_gl,
9128 state_global->box,&ddbox,state_global->x);
9130 dd_distribute_state(dd,cgs_gl,
9131 state_global,state_local,f);
9133 dd_make_local_cgs(dd,&top_local->cgs);
9135 /* Ensure that we have space for the new distribution */
9136 dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9138 if (fr->cutoff_scheme == ecutsGROUP)
9140 calc_cgcm(fplog,0,dd->ncg_home,
9141 &top_local->cgs,state_local->x,fr->cg_cm);
9144 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9146 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9148 cg0 = 0;
9150 else if (state_local->ddp_count != dd->ddp_count)
9152 if (state_local->ddp_count > dd->ddp_count)
9154 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9157 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9159 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9162 /* Clear the old state */
9163 clear_dd_indices(dd,0,0);
9165 /* Build the new indices */
9166 rebuild_cgindex(dd,cgs_gl->index,state_local);
9167 make_dd_indices(dd,cgs_gl->index,0);
9169 if (fr->cutoff_scheme == ecutsGROUP)
9171 /* Redetermine the cg COMs */
9172 calc_cgcm(fplog,0,dd->ncg_home,
9173 &top_local->cgs,state_local->x,fr->cg_cm);
9176 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9178 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9180 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9181 TRUE,&top_local->cgs,state_local->x,&ddbox);
9183 bRedist = comm->bDynLoadBal;
9185 else
9187 /* We have the full state, only redistribute the cgs */
9189 /* Clear the non-home indices */
9190 clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9192 /* Avoid global communication for dim's without pbc and -gcom */
9193 if (!bNStGlobalComm)
9195 copy_rvec(comm->box0 ,ddbox.box0 );
9196 copy_rvec(comm->box_size,ddbox.box_size);
9198 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9199 bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9201 bBoxChanged = TRUE;
9202 bRedist = TRUE;
9204 /* For dim's without pbc and -gcom */
9205 copy_rvec(ddbox.box0 ,comm->box0 );
9206 copy_rvec(ddbox.box_size,comm->box_size);
9208 set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9209 step,wcycle);
9211 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9213 write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9216 /* Check if we should sort the charge groups */
9217 if (comm->nstSortCG > 0)
9219 bSortCG = (bMasterState ||
9220 (bRedist && (step % comm->nstSortCG == 0)));
9222 else
9224 bSortCG = FALSE;
9227 ncg_home_old = dd->ncg_home;
9229 ncg_moved = 0;
9230 if (bRedist)
9232 wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9234 dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9235 state_local,f,fr,mdatoms,
9236 !bSortCG,nrnb,&cg0,&ncg_moved);
9238 wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9241 get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9242 dd,&ddbox,
9243 &comm->cell_x0,&comm->cell_x1,
9244 dd->ncg_home,fr->cg_cm,
9245 cell_ns_x0,cell_ns_x1,&grid_density);
9247 if (bBoxChanged)
9249 comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9252 switch (fr->cutoff_scheme)
9254 case ecutsGROUP:
9255 copy_ivec(fr->ns.grid->n,ncells_old);
9256 grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9257 state_local->box,cell_ns_x0,cell_ns_x1,
9258 fr->rlistlong,grid_density);
9259 break;
9260 case ecutsVERLET:
9261 nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9262 break;
9263 default:
9264 gmx_incons("unimplemented");
9266 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9267 copy_ivec(ddbox.tric_dir,comm->tric_dir);
9269 if (bSortCG)
9271 wallcycle_sub_start(wcycle,ewcsDD_GRID);
9273 /* Sort the state on charge group position.
9274 * This enables exact restarts from this step.
9275 * It also improves performance by about 15% with larger numbers
9276 * of atoms per node.
9279 /* Fill the ns grid with the home cell,
9280 * so we can sort with the indices.
9282 set_zones_ncg_home(dd);
9284 switch (fr->cutoff_scheme)
9286 case ecutsVERLET:
9287 set_zones_size(dd,state_local->box,&ddbox,0,1);
9289 nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9291 comm->zones.size[0].bb_x0,
9292 comm->zones.size[0].bb_x1,
9293 0,dd->ncg_home,
9294 comm->zones.dens_zone0,
9295 fr->cginfo,
9296 state_local->x,
9297 ncg_moved,comm->moved,
9298 fr->nbv->grp[eintLocal].kernel_type,
9299 fr->nbv->grp[eintLocal].nbat);
9301 nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9302 break;
9303 case ecutsGROUP:
9304 fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9305 0,dd->ncg_home,fr->cg_cm);
9307 copy_ivec(fr->ns.grid->n,ncells_new);
9308 break;
9309 default:
9310 gmx_incons("unimplemented");
9313 bResortAll = bMasterState;
9315 /* Check if we can use the old order and ns grid cell indices
9316 * of the charge groups to sort them efficiently. */
9318 if (ncells_new[XX] != ncells_old[XX] ||
9319 ncells_new[YY] != ncells_old[YY] ||
9320 ncells_new[ZZ] != ncells_old[ZZ])
9322 bResortAll = TRUE;
9325 if (debug)
9327 fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9328 gmx_step_str(step,sbuf),dd->ncg_home);
9330 dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9331 bResortAll ? -1 : ncg_home_old);
9332 /* Rebuild all the indices */
9333 cg0 = 0;
9334 ga2la_clear(dd->ga2la);
9336 wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9339 wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9341 /* Setup up the communication and communicate the coordinates */
9342 setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9344 /* Set the indices */
9345 make_dd_indices(dd,cgs_gl->index,cg0);
9347 /* Set the charge group boundaries for neighbor searching */
9348 set_cg_boundaries(&comm->zones);
9350 if (fr->cutoff_scheme == ecutsVERLET)
9352 set_zones_size(dd,state_local->box,&ddbox,
9353 bSortCG ? 1 : 0,comm->zones.n);
9356 wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9359 write_dd_pdb("dd_home",step,"dump",top_global,cr,
9360 -1,state_local->x,state_local->box);
9363 wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9365 /* Extract a local topology from the global topology */
9366 for(i=0; i<dd->ndim; i++)
9368 np[dd->dim[i]] = comm->cd[i].np;
9370 dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9371 comm->cellsize_min,np,
9373 fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9374 vsite,top_global,top_local);
9376 wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9378 wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9380 /* Set up the special atom communication */
9381 n = comm->nat[ddnatZONE];
9382 for(i=ddnatZONE+1; i<ddnatNR; i++)
9384 switch(i)
9386 case ddnatVSITE:
9387 if (vsite && vsite->n_intercg_vsite)
9389 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9391 break;
9392 case ddnatCON:
9393 if (dd->bInterCGcons || dd->bInterCGsettles)
9395 /* Only for inter-cg constraints we need special code */
9396 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9397 constr,ir->nProjOrder,
9398 top_local->idef.il);
9400 break;
9401 default:
9402 gmx_incons("Unknown special atom type setup");
9404 comm->nat[i] = n;
9407 wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9409 wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9411 /* Make space for the extra coordinates for virtual site
9412 * or constraint communication.
9414 state_local->natoms = comm->nat[ddnatNR-1];
9415 if (state_local->natoms > state_local->nalloc)
9417 dd_realloc_state(state_local,f,state_local->natoms);
9420 if (fr->bF_NoVirSum)
9422 if (vsite && vsite->n_intercg_vsite)
9424 nat_f_novirsum = comm->nat[ddnatVSITE];
9426 else
9428 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9430 nat_f_novirsum = dd->nat_tot;
9432 else
9434 nat_f_novirsum = dd->nat_home;
9438 else
9440 nat_f_novirsum = 0;
9443 /* Set the number of atoms required for the force calculation.
9444 * Forces need to be constrained when using a twin-range setup
9445 * or with energy minimization. For simple simulations we could
9446 * avoid some allocation, zeroing and copying, but this is
9447 * probably not worth the complications and checking. */
9449 forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9450 dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9452 /* We make all mdatoms up to nat_tot_con.
9453 * We could save some work by only setting invmass
9454 * between nat_tot and nat_tot_con. */
9456 /* This call also sets the new number of home particles to dd->nat_home */
9457 atoms2md(top_global,ir,
9458 comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9460 /* Now we have the charges we can sort the FE interactions */
9461 dd_sort_local_top(dd,mdatoms,top_local);
9463 if (shellfc)
9465 /* Make the local shell stuff, currently no communication is done */
9466 make_local_shells(cr,mdatoms,shellfc);
9469 if (ir->implicit_solvent)
9471 make_local_gb(cr,fr->born,ir->gb_algorithm);
9474 init_bonded_thread_force_reduction(fr,&top_local->idef);
9476 if (!(cr->duty & DUTY_PME))
9478 /* Send the charges to our PME only node */
9479 gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9480 mdatoms->chargeA,mdatoms->chargeB,
9481 dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9484 if (constr)
9486 set_constraints(constr,top_local,ir,mdatoms,cr);
9489 if (ir->ePull != epullNO)
9491 /* Update the local pull groups */
9492 dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9495 if (ir->bRot)
9497 /* Update the local rotation groups */
9498 dd_make_local_rotation_groups(dd,ir->rot);
9502 add_dd_statistics(dd);
9504 /* Make sure we only count the cycles for this DD partitioning */
9505 clear_dd_cycle_counts(dd);
9507 /* Because the order of the atoms might have changed since
9508 * the last vsite construction, we need to communicate the constructing
9509 * atom coordinates again (for spreading the forces this MD step).
9511 dd_move_x_vsites(dd,state_local->box,state_local->x);
9513 wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9515 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9517 dd_move_x(dd,state_local->box,state_local->x);
9518 write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9519 -1,state_local->x,state_local->box);
9522 /* Store the partitioning step */
9523 comm->partition_step = step;
9525 /* Increase the DD partitioning counter */
9526 dd->ddp_count++;
9527 /* The state currently matches this DD partitioning count, store it */
9528 state_local->ddp_count = dd->ddp_count;
9529 if (bMasterState)
9531 /* The DD master node knows the complete cg distribution,
9532 * store the count so we can possibly skip the cg info communication.
9534 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9537 if (comm->DD_debug > 0)
9539 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9540 check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9541 "after partitioning");