/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2008 David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 * Copyright (c) 2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmx_fatal.h"
#include "gmx_fatal_collective.h"
#include "domdec_network.h"
#include "chargegroup.h"
#include "pull_rotation.h"
#include "mtop_util.h"
#include "gmx_ga2la.h"
#include "nbnxn_search.h"
#include "gmx_omp_nthreads.h"
#include "gpu_utils.h"

#include "gromacs/fileio/futil.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"
#define DDRANK(dd, rank)    (rank)
#define DDMASTERRANK(dd)    (dd->masterrank)
typedef struct gmx_domdec_master
{
    /* The cell boundaries */
    /* The global charge group division */
    int  *ncg;   /* Number of home charge groups for each node */
    int  *index; /* Index of nnodes+1 into cg                  */
    int  *cg;    /* Global charge group index                  */
    int  *nat;   /* Number of home atoms for each node.        */
    int  *ibuf;  /* Buffer for communication                   */
    rvec *vbuf;  /* Buffer for state scattering and gathering  */
} gmx_domdec_master_t;
typedef struct
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that needs to be communicated.
     */
    int nsend[DD_MAXIZONE+2];
    int nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    /* The atom range for non-in-place communication */
    int cell2at0[DD_MAXIZONE];
    int cell2at1[DD_MAXIZONE];
} gmx_domdec_ind_t;
typedef struct
{
    int               np;       /* Number of grid pulses in this dimension */
    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
    gmx_bool          bInPlace; /* Can we communicate in place?            */
} gmx_domdec_comm_dim_t;
typedef struct
{
    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
    real     *cell_f;      /* State var.: cell boundaries, box relative      */
    real     *old_cell_f;  /* Temp. var.: old cell size                      */
    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
    real     *buf_ncd;     /* Temp. var.                                     */
} gmx_domdec_root_t;
#define DD_NLOAD_MAX 9

/* Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */

gmx_cgsort_t *sort_new;
/* This enum determines the order of the coordinates.
 * ddnatHOME and ddnatZONE should be first and second,
 * the others can be ordered as wanted.
 */
enum {
    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
};

enum {
    edlbAUTO, edlbNO, edlbYES, edlbNR
};
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
typedef struct
{
    int      dim;       /* The dimension                                          */
    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
    int      nslab;     /* The number of PME slabs in this dimension              */
    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB      */
    int     *pp_min;    /* The minimum pp node location, size nslab               */
    int     *pp_max;    /* The maximum pp node location, size nslab               */
    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;
typedef struct
{
    real min0; /* The minimum bottom of this zone                       */
    real max1; /* The maximum top of this zone                          */
    real min1; /* The minimum top of this zone                          */
    real mch0; /* The maximum bottom communication height for this zone */
    real mch1; /* The maximum top communication height for this zone    */
    real p1_0; /* The bottom value of the first cell in this zone       */
    real p1_1; /* The top value of the first cell in this zone          */
} gmx_ddzone_t;

typedef struct
{
    gmx_domdec_ind_t ind;
} dd_comm_setup_work_t;
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    /* The number of nodes doing PME (PP/PME or only PME) */

    /* The communication setup including the PME only nodes */
    gmx_bool    bCartesianPP_PME;
    int        *pmenodes;          /* size npmenodes                         */
    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                                    * but with bCartesianPP_PME              */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */

    /* The global charge groups */

    /* Should we sort the cgs */
    gmx_domdec_sort_t *sort;

    /* Are there charge groups? */

    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */

    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */

    /* The width of the communicated boundaries */

    /* The minimum cell size (including triclinic correction) */

    /* For dlb, for use with edlbAUTO */
    rvec cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */

    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;

    /* With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /* DLB needs to take into account that we want to allow this maximum
     * cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */

    /* box0 and box_size are required with dim's without pbc and -gcom */

    /* The cell boundaries */

    /* The old location of the cell boundaries, to check cg displacements */

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */

    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */

    /* Array for signalling if atoms have moved to another domain */

    /* Communication buffer for general use */

    /* Communication buffer for general use */

    /* Temporary storage for thread parallel communication setup */
    dd_comm_setup_work_t *dth;

    /* Communication buffers only used with multiple grid pulses */

    /* Communication buffers for local redistribution */
    int cggl_flag_nalloc[DIM*2];
    int cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real                cell_f_max0[DIM];
    real                cell_f_min1[DIM];

    /* Stuff for load communication */
    gmx_bool           bRecordLoad;
    gmx_domdec_load_t *load;
    int                nrank_gpu_shared;
    MPI_Comm          *mpi_comm_load;
    MPI_Comm           mpi_comm_gpu_shared;

    /* Maximum DLB scaling per load balancing step in percent */

    float cycl[ddCyclNr];
    int   cycl_n[ddCyclNr];
    float cycl_max[ddCyclNr];
    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */

    /* How many times have there been load measurements */

    /* How many times have we collected the load measurements */

    double sum_nat[ddnatNR-ddnatZONE];

    /* The last partition step */
    gmx_large_int_t partition_step;
} gmx_domdec_comm_t;
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
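
/* Illustrative decoding sketch (added, not part of the original source):
 * a cg's flag word stores its atom count in the low 16 bits and the
 * per-dimension move bits above them. For a hypothetical value
 *   flag = 3 | DD_FLAG_BW(1)
 * we get
 *   flag & DD_FLAG_NRCG  == 3    (the cg contains 3 atoms)
 *   flag & DD_FLAG_BW(1) != 0    (it moves backward along dimension 1,
 *                                 since DD_FLAG_BW(1) == 1<<19)
 */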
/* Zone permutation required to obtain consecutive charge groups
 * for neighbor searching.
 */
static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};

static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};

static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};

static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
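
/* Reading guide (added, best-effort interpretation): each dd_zpN row appears
 * to be {i-zone, first j-zone, last j-zone + 1} in dd_zo order, e.g. the
 * dd_zp3 row {1, 3, 6} would mean i-zone 1 interacts with j-zones 3, 4 and 5.
 */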
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Allowed performance loss before we DLB or warn */
#define DD_PERF_LOSS         0.05

#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
/*
   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])

   static void index2xyz(ivec nc, int ind, ivec xyz)
   {
       xyz[XX] = ind % nc[XX];
       xyz[YY] = (ind / nc[XX]) % nc[YY];
       xyz[ZZ] = ind / (nc[YY]*nc[XX]);
   }
 */

/* This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
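
/* Worked example (added, illustrative): with nc = {2, 3, 4},
 *   dd_index(nc, {0,0,0}) = 0,   dd_index(nc, {0,0,1}) = 1,
 *   dd_index(nc, {0,1,0}) = 4,   dd_index(nc, {1,0,0}) = 12,
 * i.e. consecutive ranks first step through z, then y, then x, so all ranks
 * sharing an x coordinate are numbered contiguously, which matches the
 * x-decomposed PME grid.
 */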
static void ddindex2xyz(ivec nc, int ind, ivec xyz)
{
    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
    xyz[ZZ] = ind % nc[ZZ];
}
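
/* Note (added): ddindex2xyz is the inverse of dd_index above; for any cell
 * coordinate c within nc, ddindex2xyz(nc, dd_index(nc, c), xyz) recovers
 * xyz == c.
 */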
static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
{
    int ddindex;
    int ddnodeid = -1;

    ddindex = dd_index(dd->nc, c);
    if (dd->comm->bCartesianPP_PME)
    {
        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
    }
    else if (dd->comm->bCartesianPP)
    {
        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
    }
    else
    {
        ddnodeid = ddindex;
    }

    return ddnodeid;
}
static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
{
    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
}
int ddglatnr(gmx_domdec_t *dd, int i)
{
    int atnr;

    if (i >= dd->comm->nat[ddnatNR-1])
    {
        gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
    }
    atnr = dd->gatindex[i] + 1;

    return atnr;
}
t_block *dd_charge_groups_global(gmx_domdec_t *dd)
{
    return &dd->comm->cgs_gl;
}
static void vec_rvec_init(vec_rvec_t *v)
{
    /* Assumed initialization (reconstructed): start with an empty buffer */
    v->nalloc = 0;
    v->v      = NULL;
}

static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
{
    if (n > v->nalloc)
    {
        v->nalloc = over_alloc_dd(n);
        srenew(v->v, v->nalloc);
    }
}
void dd_store_state(gmx_domdec_t *dd, t_state *state)
{
    int i;

    if (state->ddp_count != dd->ddp_count)
    {
        gmx_incons("The state does not match the domain decomposition state");
    }

    state->ncg_gl = dd->ncg_home;
    if (state->ncg_gl > state->cg_gl_nalloc)
    {
        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
        srenew(state->cg_gl, state->cg_gl_nalloc);
    }
    for (i = 0; i < state->ncg_gl; i++)
    {
        state->cg_gl[i] = dd->index_gl[i];
    }

    state->ddp_count_cg_gl = dd->ddp_count;
}
gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
{
    return &dd->comm->zones;
}
void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
{
    gmx_domdec_zones_t *zones;
    int                 izone, d, dim;

    zones = &dd->comm->zones;

    izone = 0;
    while (icg >= zones->izone[izone].cg1)
    {
        izone++;
    }

    if (izone == 0)
    {
        *jcg0 = icg;
    }
    else if (izone < zones->nizone)
    {
        *jcg0 = zones->izone[izone].jcg0;
    }
    else
    {
        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
                  icg, izone, zones->nizone);
    }

    *jcg1 = zones->izone[izone].jcg1;

    for (d = 0; d < dd->ndim; d++)
    {
        dim         = dd->dim[d];
        shift0[dim] = zones->izone[izone].shift0[dim];
        shift1[dim] = zones->izone[izone].shift1[dim];
        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
        {
            /* A conservative approach, this can be optimized */
            shift0[dim] -= 1;
            shift1[dim] += 1;
        }
    }
}

int dd_natoms_vsite(gmx_domdec_t *dd)
{
    return dd->comm->nat[ddnatVSITE];
}
void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
{
    *at_start = dd->comm->nat[ddnatCON-1];
    *at_end   = dd->comm->nat[ddnatCON];
}
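
/* Note (added): the nat[] array is cumulative, so a single atom type's
 * entries occupy the half-open range [nat[t-1], nat[t]), which is why the
 * constraint range above is read from ddnatCON-1 and ddnatCON.
 */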
void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
    gmx_bool               bPBC, bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = 1;
    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (bPBC)
        {
            copy_rvec(box[dd->dim[d]], shift);
        }
        cd = &comm->cd[d];
        for (p = 0; p < cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n     = 0;
            if (!bPBC)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        copy_rvec(x[j], buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* We need to shift the coordinates */
                        rvec_add(x[j], shift, buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* Shift x */
                        buf[n][XX] = x[j][XX] + shift[XX];
                        /* Rotate y and z.
                         * This operation requires a special shift force
                         * treatment, which is performed in calc_vir.
                         */
                        buf[n][YY] = box[YY][YY] - x[j][YY];
                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
                        n++;
                    }
                }
            }

            if (cd->bInPlace)
            {
                rbuf = x + nat_tot;
            }
            else
            {
                rbuf = comm->vbuf2.v;
            }
            /* Send and receive the coordinates */
            dd_sendrecv_rvec(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(rbuf[j], x[i]);
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    rvec                  *buf, *sbuf;
    ivec                   vis;
    int                    is;
    gmx_bool               bPBC, bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (fshift == NULL && !bScrew)
        {
            bPBC = FALSE;
        }
        /* Determine which shift vector we need */
        clear_ivec(vis);
        vis[dd->dim[d]] = 1;
        is              = IVEC2IS(vis);

        cd = &comm->cd[d];
        for (p = cd->np-1; p >= 0; p--)
        {
            ind      = &cd->ind[p];
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = f + nat_tot;
            }
            else
            {
                sbuf = comm->vbuf2.v;
                j    = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(f[i], sbuf[j]);
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_rvec(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            index = ind->index;
            /* Add the received forces */
            n = 0;
            if (!bPBC)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                        /* Add this force to the shift force */
                        rvec_inc(fshift[is], buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* Rotate the force */
                        f[j][XX] += buf[n][XX];
                        f[j][YY] -= buf[n][YY];
                        f[j][ZZ] -= buf[n][ZZ];
                        if (fshift)
                        {
                            /* Add this force to the shift force */
                            rvec_inc(fshift[is], buf[n]);
                        }
                        n++;
                    }
                }
            }
        }
        nzone /= 2;
    }
}
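
/* Note (added): the sign flips in the screw-PBC branch above mirror the
 * coordinate transform in dd_move_x, where y and z are reflected; flipping
 * the y and z force components applies the inverse rotation to the forces.
 */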
void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    real                  *buf, *rbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = 1;
    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        cd = &comm->cd[d];
        for (p = 0; p < cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n     = 0;
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    buf[n] = v[j];
                    n++;
                }
            }

            if (cd->bInPlace)
            {
                rbuf = v + nat_tot;
            }
            else
            {
                rbuf = &comm->vbuf2.v[0][0];
            }
            /* Send and receive the coordinates */
            dd_sendrecv_real(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        v[i] = rbuf[j];
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    real                  *buf, *sbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        cd = &comm->cd[d];
        for (p = cd->np-1; p >= 0; p--)
        {
            ind      = &cd->ind[p];
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = v + nat_tot;
            }
            else
            {
                sbuf = &comm->vbuf2.v[0][0];
                j    = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        sbuf[j] = v[i];
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_real(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            index = ind->index;
            /* Add the received forces */
            n = 0;
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    v[j] += buf[n];
                    n++;
                }
            }
        }
        nzone /= 2;
    }
}
static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
{
    fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d, i, j,
            zone->min0, zone->max1,
            zone->mch0, zone->mch1,
            zone->p1_0, zone->p1_1);
}
#define DDZONECOMM_MAXZONE  5
#define DDZONECOMM_BUFSIZE  3

static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                               int ddimind, int direction,
                               gmx_ddzone_t *buf_s, int n_s,
                               gmx_ddzone_t *buf_r, int n_r)
{
#define ZBS  DDZONECOMM_BUFSIZE
    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
    int  i;

    for (i = 0; i < n_s; i++)
    {
        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
        vbuf_s[i*ZBS+1][2] = 0;
        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
        vbuf_s[i*ZBS+2][2] = 0;
    }

    dd_sendrecv_rvec(dd, ddimind, direction,
                     vbuf_s, n_s*ZBS,
                     vbuf_r, n_r*ZBS);

    for (i = 0; i < n_r; i++)
    {
        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
    }
#undef ZBS
}
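
/* Note (added): the seven reals of a gmx_ddzone_t are packed into
 * DDZONECOMM_BUFSIZE (= 3) rvecs above so the existing rvec send/receive
 * path can be reused; the two unused slots are zero-filled on send and
 * simply ignored on receive.
 */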
static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0, rvec cell_ns_x1)
{
    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
    gmx_ddzone_t      *zp;
    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
    rvec               extr_s[2], extr_r[2];
    rvec               dh;
    real               dist_d, c = 0, det;
    gmx_domdec_comm_t *comm;
    gmx_bool           bPBC, bUse;

    comm = dd->comm;

    for (d = 1; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    for (d = dd->ndim-2; d >= 0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (d1 = d; d1 < dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;
            pos++;
        }

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for (p = 0; p < npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            if (bUse)
            {
                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        buf_size = pos;
        for (p = 0; p < npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            clear_rvec(dh);
            if (p > 0)
            {
                for (d1 = d+1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (i = 0; i < buf_size; i++)
            {
                if (p == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
                    }

                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */
                pos = 0;

                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }

    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for (i = 0; i < 2; i++)
        {
            if (debug)
            {
                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
        }
    }
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
            }
        }
    }
    for (d = 1; d < dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
        }
    }
}
static void dd_collect_cg(gmx_domdec_t *dd,
                          t_state      *state_local)
{
    gmx_domdec_master_t *ma = NULL;
    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
    t_block             *cgs_gl;

    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
    {
        /* The master has the correct distribution */
        return;
    }

    if (state_local->ddp_count == dd->ddp_count)
    {
        ncg_home = dd->ncg_home;
        cg       = dd->index_gl;
        nat_home = dd->nat_home;
    }
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
    {
        cgs_gl = &dd->comm->cgs_gl;

        ncg_home = state_local->ncg_gl;
        cg       = state_local->cg_gl;
        nat_home = 0;
        for (i = 0; i < ncg_home; i++)
        {
            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
        }
    }
    else
    {
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
    }

    buf2[0] = dd->ncg_home;
    buf2[1] = dd->nat_home;
    if (DDMASTER(dd))
    {
        ma   = dd->ma;
        ibuf = ma->ibuf;
    }
    else
    {
        ibuf = NULL;
    }
    /* Collect the charge group and atom counts on the master */
    dd_gather(dd, 2*sizeof(int), buf2, ibuf);

    if (DDMASTER(dd))
    {
        ma->index[0] = 0;
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ncg[i]     = ma->ibuf[2*i];
            ma->nat[i]     = ma->ibuf[2*i+1];
            ma->index[i+1] = ma->index[i] + ma->ncg[i];
        }
        /* Make byte counts and indices */
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
        }
        if (debug)
        {
            fprintf(debug, "Initial charge group distribution: ");
            for (i = 0; i < dd->nnodes; i++)
            {
                fprintf(debug, " %d", ma->ncg[i]);
            }
            fprintf(debug, "\n");
        }
    }

    /* Collect the charge group indices on the master */
    dd_gatherv(dd,
               dd->ncg_home*sizeof(int), dd->index_gl,
               DDMASTER(dd) ? ma->ibuf : NULL,
               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
               DDMASTER(dd) ? ma->cg : NULL);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
}
static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
                                    rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;
    rvec                *buf = NULL;
    t_block             *cgs_gl;

    ma = dd->ma;

    if (!DDMASTER(dd))
    {
        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
                 dd->rank, dd->mpi_comm_all);
    }
    else
    {
        /* Copy the master coordinates to the global array */
        cgs_gl = &dd->comm->cgs_gl;

        n = DDMASTERRANK(dd);
        a = 0;
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(lv[a++], v[c]);
            }
        }

        for (n = 0; n < dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf, nalloc);
                }

                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);

                a = 0;
                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                {
                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(buf[a++], v[c]);
                    }
                }
            }
        }
        sfree(buf);
    }
}
static void get_commbuffer_counts(gmx_domdec_t *dd,
                                  int **counts, int **disps)
{
    gmx_domdec_master_t *ma;
    int                  n;

    ma = dd->ma;

    /* Make the rvec count and displacement arrays */
    *counts = ma->ibuf;
    *disps  = ma->ibuf + dd->nnodes;
    for (n = 0; n < dd->nnodes; n++)
    {
        (*counts)[n] = ma->nat[n]*sizeof(rvec);
        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
    }
}
static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
                                   rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                 *rcounts = NULL, *disps = NULL;
    int                  n, i, c, a;
    rvec                *buf = NULL;
    t_block             *cgs_gl;

    ma = dd->ma;

    if (DDMASTER(dd))
    {
        get_commbuffer_counts(dd, &rcounts, &disps);

        buf = ma->vbuf;
    }

    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);

    if (DDMASTER(dd))
    {
        cgs_gl = &dd->comm->cgs_gl;

        a = 0;
        for (n = 0; n < dd->nnodes; n++)
        {
            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(buf[a++], v[c]);
                }
            }
        }
    }
}
void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local, rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;

    dd_collect_cg(dd, state_local);

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_collect_vec_sendrecv(dd, lv, v);
    }
    else
    {
        dd_collect_vec_gatherv(dd, lv, v);
    }
}
void dd_collect_state(gmx_domdec_t *dd,
                      t_state *state_local, t_state *state)
{
    int est, i, j, nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        for (i = 0; i < efptNR; i++)
        {
            state->lambda[i] = state_local->lambda[i];
        }
        state->fep_state = state_local->fep_state;
        state->veta      = state_local->veta;
        state->vol0      = state_local->vol0;
        copy_mat(state_local->box, state->box);
        copy_mat(state_local->boxv, state->boxv);
        copy_mat(state_local->svir_prev, state->svir_prev);
        copy_mat(state_local->fvir_prev, state->fvir_prev);
        copy_mat(state_local->pres_prev, state->pres_prev);

        for (i = 0; i < state_local->ngtc; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state->nosehoover_xi[i*nh+j]  = state_local->nosehoover_xi[i*nh+j];
                state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
            }
            state->therm_integral[i] = state_local->therm_integral[i];
        }
        for (i = 0; i < state_local->nnhpres; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state->nhpres_xi[i*nh+j]  = state_local->nhpres_xi[i*nh+j];
                state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
            }
        }
    }
    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
        {
            switch (est)
            {
                case estX:
                    dd_collect_vec(dd, state_local, state_local->x, state->x);
                    break;
                case estV:
                    dd_collect_vec(dd, state_local, state_local->v, state->v);
                    break;
                case estSDX:
                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
                    break;
                case estCGP:
                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
                    break;
                case estLD_RNG:
                    if (state->nrngi == 1)
                    {
                        if (DDMASTER(dd))
                        {
                            for (i = 0; i < state_local->nrng; i++)
                            {
                                state->ld_rng[i] = state_local->ld_rng[i];
                            }
                        }
                    }
                    else
                    {
                        dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
                                  state_local->ld_rng, state->ld_rng);
                    }
                    break;
                case estLD_RNGI:
                    if (state->nrngi == 1)
                    {
                        if (DDMASTER(dd))
                        {
                            state->ld_rngi[0] = state_local->ld_rngi[0];
                        }
                    }
                    else
                    {
                        dd_gather(dd, sizeof(state->ld_rngi[0]),
                                  state_local->ld_rngi, state->ld_rngi);
                    }
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_collect_state");
            }
        }
    }
}
static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
{
    int est;

    if (debug)
    {
        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
    }

    state->nalloc = over_alloc_dd(nalloc);

    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state->flags & (1<<est)))
        {
            switch (est)
            {
                case estX:
                    srenew(state->x, state->nalloc);
                    break;
                case estV:
                    srenew(state->v, state->nalloc);
                    break;
                case estSDX:
                    srenew(state->sd_X, state->nalloc);
                    break;
                case estCGP:
                    srenew(state->cg_p, state->nalloc);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    /* No reallocation required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
            }
        }
    }

    if (f != NULL)
    {
        srenew(*f, state->nalloc);
    }
}
static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
                               int nalloc)
{
    if (nalloc > fr->cg_nalloc)
    {
        if (debug)
        {
            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
        }
        fr->cg_nalloc = over_alloc_dd(nalloc);
        srenew(fr->cginfo, fr->cg_nalloc);
        if (fr->cutoff_scheme == ecutsGROUP)
        {
            srenew(fr->cg_cm, fr->cg_nalloc);
        }
    }
    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
    {
        /* We don't use charge groups, we use x in state to set up
         * the atom communication.
         */
        dd_realloc_state(state, f, nalloc);
    }
}
static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;
    rvec                *buf = NULL;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        for (n = 0; n < dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf, nalloc);
                }
                /* Use lv as a temporary buffer */
                a = 0;
                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                {
                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(v[c], buf[a++]);
                    }
                }
                if (a != ma->nat[n])
                {
                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
                              a, ma->nat[n]);
                }

                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
                         DDRANK(dd, n), n, dd->mpi_comm_all);
            }
        }
        sfree(buf);
        n = DDMASTERRANK(dd);
        a = 0;
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(v[c], lv[a++]);
            }
        }
    }
    else
    {
        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
    }
}
static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                 *scounts = NULL, *disps = NULL;
    int                  n, i, c, a, nalloc = 0;
    rvec                *buf = NULL;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        get_commbuffer_counts(dd, &scounts, &disps);

        buf = ma->vbuf;
        a   = 0;
        for (n = 0; n < dd->nnodes; n++)
        {
            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(v[c], buf[a++]);
                }
            }
        }
    }

    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
}
static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
{
    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
    }
    else
    {
        dd_distribute_vec_scatterv(dd, cgs, v, lv);
    }
}
static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
{
    int i;

    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);

    if (dfhist->nlambda > 0)
    {
        int nlam = dfhist->nlambda;
        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);

        for (i = 0; i < nlam; i++)
        {
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
        }
    }
}
static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
                                t_state *state, t_state *state_local,
                                rvec **f)
{
    int i, j, nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        for (i = 0; i < efptNR; i++)
        {
            state_local->lambda[i] = state->lambda[i];
        }
        state_local->fep_state = state->fep_state;
        state_local->veta      = state->veta;
        state_local->vol0      = state->vol0;
        copy_mat(state->box, state_local->box);
        copy_mat(state->box_rel, state_local->box_rel);
        copy_mat(state->boxv, state_local->boxv);
        copy_mat(state->svir_prev, state_local->svir_prev);
        copy_mat(state->fvir_prev, state_local->fvir_prev);
        copy_df_history(&state_local->dfhist, &state->dfhist);
        for (i = 0; i < state_local->ngtc; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state_local->nosehoover_xi[i*nh+j]  = state->nosehoover_xi[i*nh+j];
                state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
            }
            state_local->therm_integral[i] = state->therm_integral[i];
        }
        for (i = 0; i < state_local->nnhpres; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state_local->nhpres_xi[i*nh+j]  = state->nhpres_xi[i*nh+j];
                state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
            }
        }
    }
    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
    dd_bcast(dd, sizeof(int), &state_local->fep_state);
    dd_bcast(dd, sizeof(real), &state_local->veta);
    dd_bcast(dd, sizeof(real), &state_local->vol0);
    dd_bcast(dd, sizeof(state_local->box), state_local->box);
    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);

    /* communicate df_history -- required for restarting from checkpoint */
    dd_distribute_dfhist(dd, &state_local->dfhist);

    if (dd->nat_home > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, dd->nat_home);
    }
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
        {
            switch (i)
            {
                case estX:
                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
                    break;
                case estV:
                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
                    break;
                case estSDX:
                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
                    break;
                case estCGP:
                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
                    break;
                case estLD_RNG:
                    if (state->nrngi == 1)
                    {
                        dd_bcastc(dd,
                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
                                  state->ld_rng, state_local->ld_rng);
                    }
                    else
                    {
                        dd_scatter(dd,
                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
                                   state->ld_rng, state_local->ld_rng);
                    }
                    break;
                case estLD_RNGI:
                    if (state->nrngi == 1)
                    {
                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
                                  state->ld_rngi, state_local->ld_rngi);
                    }
                    else
                    {
                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
                                   state->ld_rngi, state_local->ld_rngi);
                    }
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    /* Not implemented yet */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
            }
        }
    }
}
static char dim2char(int dim)
{
    char c = '?';

    switch (dim)
    {
        case XX: c = 'X'; break;
        case YY: c = 'Y'; break;
        case ZZ: c = 'Z'; break;
        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
    }

    return c;
}
static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
{
    rvec   grid_s[2], *grid_r = NULL, cx, r;
    char   fname[STRLEN], format[STRLEN], buf[22];
    FILE  *out;
    int    a, i, d, z, y, x;
    matrix tric;
    real   vol;

    copy_rvec(dd->comm->cell_x0, grid_s[0]);
    copy_rvec(dd->comm->cell_x1, grid_s[1]);

    if (DDMASTER(dd))
    {
        snew(grid_r, 2*dd->nnodes);
    }

    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);

    if (DDMASTER(dd))
    {
        for (d = 0; d < DIM; d++)
        {
            for (i = 0; i < DIM; i++)
            {
                if (d == i)
                {
                    tric[d][i] = 1;
                }
                else
                {
                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
                    {
                        tric[d][i] = box[i][d]/box[i][i];
                    }
                    else
                    {
                        tric[d][i] = 0;
                    }
                }
            }
        }
        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
        out = gmx_fio_fopen(fname, "w");
        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
        a = 1;
        for (i = 0; i < dd->nnodes; i++)
        {
            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
            for (d = 0; d < DIM; d++)
            {
                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
            }
            for (z = 0; z < 2; z++)
            {
                for (y = 0; y < 2; y++)
                {
                    for (x = 0; x < 2; x++)
                    {
                        cx[XX] = grid_r[i*2+x][XX];
                        cx[YY] = grid_r[i*2+y][YY];
                        cx[ZZ] = grid_r[i*2+z][ZZ];
                        mvmul(tric, cx, r);
                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
                                ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
                    }
                }
            }
            for (d = 0; d < DIM; d++)
            {
                for (x = 0; x < 4; x++)
                {
                    switch (d)
                    {
                        case 0: y = 1 + i*8 + 2*x; break;
                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
                        case 2: y = 1 + i*8 + x; break;
                    }
                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
                }
            }
        }
        gmx_fio_fclose(out);
        sfree(grid_r);
    }
}
void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
                  gmx_mtop_t *mtop, t_commrec *cr,
                  int natoms, rvec x[], matrix box)
{
    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
    FILE         *out;
    int           i, ii, resnr, c;
    char         *atomname, *resname;
    real          b;
    gmx_domdec_t *dd;

    dd = cr->dd;
    if (natoms == -1)
    {
        natoms = dd->comm->nat[ddnatVSITE];
    }

    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);

    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");

    out = gmx_fio_fopen(fname, "w");

    fprintf(out, "TITLE     %s\n", title);
    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
    for (i = 0; i < natoms; i++)
    {
        ii = dd->gatindex[i];
        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
        if (i < dd->comm->nat[ddnatZONE])
        {
            c = 0;
            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
            {
                c++;
            }
            b = c;
        }
        else if (i < dd->comm->nat[ddnatVSITE])
        {
            b = dd->comm->zones.n;
        }
        else
        {
            b = dd->comm->zones.n + 1;
        }
        fprintf(out, strlen(atomname) < 4 ? format : format4,
                "ATOM", (ii+1)%100000,
                atomname, resname, ' ', resnr%10000, ' ',
                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
    }
    fprintf(out, "TER\n");

    gmx_fio_fclose(out);
}
real dd_cutoff_mbody(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                di;
    real               r;

    comm = dd->comm;

    r = -1;
    if (comm->bInterCGBondeds)
    {
        if (comm->cutoff_mbody > 0)
        {
            r = comm->cutoff_mbody;
        }
        else
        {
            /* cutoff_mbody=0 means we do not have DLB */
            r = comm->cellsize_min[dd->dim[0]];
            for (di = 1; di < dd->ndim; di++)
            {
                r = min(r, comm->cellsize_min[dd->dim[di]]);
            }
            if (comm->bBondComm)
            {
                r = max(r, comm->cutoff_mbody);
            }
            else
            {
                r = min(r, comm->cutoff);
            }
        }
    }

    return r;
}

real dd_cutoff_twobody(gmx_domdec_t *dd)
{
    real r_mb;

    r_mb = dd_cutoff_mbody(dd);

    return max(dd->comm->cutoff, r_mb);
}
static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
{
    int nc, ntot;

    nc   = dd->nc[dd->comm->cartpmedim];
    ntot = dd->comm->ntot[dd->comm->cartpmedim];
    copy_ivec(coord, coord_pme);
    coord_pme[dd->comm->cartpmedim] =
        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
}
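
/* Worked example (added, illustrative): with nc = 4 PP cells and ntot = 6
 * Cartesian slots (i.e. 2 PME slots appended), the formula above maps PP
 * coordinates 0,1 -> PME coordinate 4 and 2,3 -> 5, so each PME rank
 * serves a contiguous block of PP cells.
 */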
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    /* Here we assign a PME node to communicate with this DD node
     * by assuming that the major index of both is x.
     * We add cr->npmenodes/2 to obtain an even distribution.
     */
    return (ddindex*npme + npme/2)/ndd;
}
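
/* Worked example (added, illustrative): with ndd = 8 and npme = 2,
 * (ddindex*2 + 1)/8 maps DD indices 0-3 to PME index 0 and 4-7 to PME
 * index 1; the +npme/2 offset keeps the blocks evenly centered.
 */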
static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
{
    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
}

static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
{
    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
}
static int *dd_pmenodes(t_commrec *cr)
{
    int *pmenodes;
    int  n, i, p0, p1;

    snew(pmenodes, cr->npmenodes);
    n = 0;
    for (i = 0; i < cr->dd->nnodes; i++)
    {
        p0 = cr_ddindex2pmeindex(cr, i);
        p1 = cr_ddindex2pmeindex(cr, i+1);
        if (i+1 == cr->dd->nnodes || p1 > p0)
        {
            if (debug)
            {
                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
            }
            pmenodes[n] = i + 1 + n;
            n++;
        }
    }

    return pmenodes;
}
static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
{
    gmx_domdec_t *dd;
    ivec          coords, coords_pme, nc;
    int           slab;

    dd = cr->dd;
    /*
       if (dd->comm->bCartesian) {
       gmx_ddindex2xyz(dd->nc,ddindex,coords);
       dd_coords2pmecoords(dd,coords,coords_pme);
       copy_ivec(dd->ntot,nc);
       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];

       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
       } else {
       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
       }
     */
    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));

    return slab;
}
static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
{
    gmx_domdec_comm_t *comm;
    ivec               coords;
    int                ddindex, nodeid = -1;

    comm = cr->dd->comm;

    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    if (comm->bCartesianPP_PME)
    {
        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
    }
    else
    {
        ddindex = dd_index(cr->dd->nc, coords);
        if (comm->bCartesianPP)
        {
            nodeid = comm->ddindex2simnodeid[ddindex];
        }
        else
        {
            if (comm->pmenodes)
            {
                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
            }
            else
            {
                nodeid = ddindex;
            }
        }
    }

    return nodeid;
}
static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    ivec               coord, coord_pme;
    int                i;
    int                pmenode = -1;

    dd   = cr->dd;
    comm = dd->comm;

    /* This assumes a uniform x domain decomposition grid cell size */
    if (comm->bCartesianPP_PME)
    {
        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            /* This is a PP node */
            dd_cart_coord2pmecoord(dd, coord, coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
        }
    }
    else if (comm->bCartesianPP)
    {
        if (sim_nodeid < dd->nnodes)
        {
            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
        }
    }
    else
    {
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (dd->comm->pmenodes == NULL)
        {
            if (sim_nodeid < dd->nnodes)
            {
                /* The DD index equals the nodeid */
                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
            }
        }
        else
        {
            i = 0;
            while (sim_nodeid > dd->comm->pmenodes[i])
            {
                i++;
            }
            if (sim_nodeid < dd->comm->pmenodes[i])
            {
                pmenode = dd->comm->pmenodes[i];
            }
        }
    }

    return pmenode;
}
void get_pme_nnodes(const gmx_domdec_t *dd,
                    int *npmenodes_x, int *npmenodes_y)
{
    *npmenodes_x = dd->comm->npmenodes_x;
    *npmenodes_y = dd->comm->npmenodes_y;
}
gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
{
    gmx_bool bPMEOnlyNode;

    if (DOMAINDECOMP(cr))
    {
        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
    }
    else
    {
        bPMEOnlyNode = FALSE;
    }

    return bPMEOnlyNode;
}
void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
{
    gmx_domdec_t *dd;
    int           x, y, z;
    ivec          coord, coord_pme;

    dd = cr->dd;

    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);

    *nmy_ddnodes = 0;
    for (x = 0; x < dd->nc[XX]; x++)
    {
        for (y = 0; y < dd->nc[YY]; y++)
        {
            for (z = 0; z < dd->nc[ZZ]; z++)
            {
                if (dd->comm->bCartesianPP_PME)
                {
                    coord[XX] = x;
                    coord[YY] = y;
                    coord[ZZ] = z;
                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
                    if (dd->ci[XX] == coord_pme[XX] &&
                        dd->ci[YY] == coord_pme[YY] &&
                        dd->ci[ZZ] == coord_pme[ZZ])
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
                else
                {
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
            }
        }
    }

    /* The last PP-only node is the peer node */
    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];

    if (debug)
    {
        fprintf(debug, "Receive coordinates from PP nodes:");
        for (x = 0; x < *nmy_ddnodes; x++)
        {
            fprintf(debug, " %d", (*my_ddnodes)[x]);
        }
        fprintf(debug, "\n");
    }
}
static gmx_bool receive_vir_ener(t_commrec *cr)
{
    gmx_domdec_comm_t *comm;
    int                pmenode, coords[DIM], rank;
    gmx_bool           bReceive;

    bReceive = TRUE;
    if (cr->npmenodes < cr->dd->nnodes)
    {
        comm = cr->dd->comm;
        if (comm->bCartesianPP_PME)
        {
            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);

            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
            coords[comm->cartpmedim]++;
            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
            {
                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                if (dd_simnode2pmenode(cr, rank) == pmenode)
                {
                    /* This is not the last PP node for pmenode */
                    bReceive = FALSE;
                }
            }
        }
        else
        {
            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
            if (cr->sim_nodeid+1 < cr->nnodes &&
                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
            {
                /* This is not the last PP node for pmenode */
                bReceive = FALSE;
            }
        }
    }

    return bReceive;
}
static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;
    int                 i;

    zones = &dd->comm->zones;

    zones->cg_range[0] = 0;
    for (i = 1; i < zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
    /* zone_ncg1[0] should always be equal to ncg_home */
    dd->comm->zone_ncg1[0] = dd->ncg_home;
}
static void rebuild_cgindex(gmx_domdec_t *dd,
                            const int *gcgs_index, t_state *state)
{
    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;

    ind      = state->cg_gl;
    dd_cg_gl = dd->index_gl;
    cgindex  = dd->cgindex;
    nat      = 0;
    for (i = 0; i < state->ncg_gl; i++)
    {
        cgindex[i]  = nat;
        cg_gl       = ind[i];
        dd_cg_gl[i] = cg_gl;
        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
    }
    cgindex[i] = nat;

    dd->ncg_home = state->ncg_gl;
    dd->nat_home = nat;

    set_zones_ncg_home(dd);
}
static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
{
    while (cg >= cginfo_mb->cg_end)
    {
        cginfo_mb++;
    }

    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
}
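
/* Note (added): cginfo is stored once per molecule-block entry; the modulo
 * by cg_mod folds every copy of a repeated molecule onto that single stored
 * entry, e.g. for a block of identical molecules of 3 cgs each starting at
 * cg_start = 0, cgs 7, 10 and 13 all map to stored entry 1.
 */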
static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
                          t_forcerec *fr, char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;
    int         *cginfo;
    int          cg;

    if (fr != NULL)
    {
        cginfo_mb = fr->cginfo_mb;
        cginfo    = fr->cginfo;

        for (cg = cg0; cg < cg1; cg++)
        {
            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
        }
    }

    if (bLocalCG != NULL)
    {
        for (cg = cg0; cg < cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index, int cg_start)
{
    int       nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
    int      *zone2cg, *zone_ncg1, *index_gl, *gatindex;
    gmx_bool  bCGs;
    char     *bLocalCG;

    bLocalCG = dd->comm->bLocalCG;

    if (dd->nat_tot > dd->gatindex_nalloc)
    {
        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
        srenew(dd->gatindex, dd->gatindex_nalloc);
    }

    nzone     = dd->comm->zones.n;
    zone2cg   = dd->comm->zones.cg_range;
    zone_ncg1 = dd->comm->zone_ncg1;
    index_gl  = dd->index_gl;
    gatindex  = dd->gatindex;
    bCGs      = dd->comm->bCGs;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    a = dd->cgindex[cg_start];
    for (zone = 0; zone < nzone; zone++)
    {
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        cg1    = zone2cg[zone+1];
        cg1_p1 = cg0 + zone_ncg1[zone];

        for (cg = cg0; cg < cg1; cg++)
        {
            zone1 = zone;
            if (cg >= cg1_p1)
            {
                /* Signal that this cg is from more than one pulse away */
                zone1 += nzone;
            }
            cg_gl = index_gl[cg];
            if (bCGs)
            {
                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                {
                    gatindex[a] = a_gl;
                    ga2la_set(dd->ga2la, a_gl, a, zone1);
                    a++;
                }
            }
            else
            {
                gatindex[a] = cg_gl;
                ga2la_set(dd->ga2la, cg_gl, a, zone1);
                a++;
            }
        }
    }
}
static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                          const char *where)
{
    int ncg, i, ngl, nerr;

    nerr = 0;
    if (bLocalCG == NULL)
    {
        return nerr;
    }

    for (i = 0; i < dd->ncg_tot; i++)
    {
        if (!bLocalCG[dd->index_gl[i]])
        {
            fprintf(stderr,
                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
            nerr++;
        }
    }
    ngl = 0;
    for (i = 0; i < ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->ncg_tot)
    {
        fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
        nerr++;
    }

    return nerr;
}
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys, int ncg_sys,
                                    const char *where)
{
    int  nerr, ngl, i, a, cell;
    int *have;

    nerr = 0;

    if (dd->comm->DD_debug > 1)
    {
        snew(have, natoms_sys);
        for (a = 0; a < dd->nat_tot; a++)
        {
            if (have[dd->gatindex[a]] > 0)
            {
                fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
            }
            else
            {
                have[dd->gatindex[a]] = a + 1;
            }
        }
        sfree(have);
    }

    snew(have, dd->nat_tot);

    ngl = 0;
    for (i = 0; i < natoms_sys; i++)
    {
        if (ga2la_get(dd->ga2la, i, &a, &cell))
        {
            if (a >= dd->nat_tot)
            {
                fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
                nerr++;
            }
            else
            {
                have[a] = 1;
                if (dd->gatindex[a] != i)
                {
                    fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != dd->nat_tot)
    {
        fprintf(stderr,
                "DD node %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank, where, ngl, dd->nat_tot);
    }
    for (a = 0; a < dd->nat_tot; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD node %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank, where, a+1, dd->gatindex[a]+1);
        }
    }
    sfree(have);

    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
                  dd->rank, where, nerr);
    }
}
static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
{
    int   i;
    char *bLocalCG;

    if (a_start == 0)
    {
        /* Clear the whole list without searching */
        ga2la_clear(dd->ga2la);
    }
    else
    {
        for (i = a_start; i < dd->nat_tot; i++)
        {
            ga2la_del(dd->ga2la, dd->gatindex[i]);
        }
    }

    bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for (i = cg_start; i < dd->ncg_tot; i++)
        {
            bLocalCG[dd->index_gl[i]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}
/* This function should be used for moving the domain boundaries during DLB,
 * for obtaining the minimum cell size. It checks the initially set limit
 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 * and, possibly, a longer cut-off limit set for PME load balancing.
 */
static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
{
    real cellsize_min;

    cellsize_min = comm->cellsize_min[dim];

    if (!comm->bVacDLBNoLimit)
    {
        /* The cut-off might have changed, e.g. by PME load balancing,
         * from the value used to set comm->cellsize_min, so check it.
         */
        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);

        if (comm->bPMELoadBalDLBLimits)
        {
            /* Check for the cut-off limit set by the PME load balancing */
            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
        }
    }

    return cellsize_min;
}
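
/* Added illustration (not part of the original code), with hypothetical
 * numbers: for np_dlb = 2 communication pulses and a current cut-off of
 * 1.2 nm, the max() above enforces a cell size of at least 1.2/2 = 0.6 nm,
 * because the np_dlb pulses together must span the full cut-off distance.
 */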

static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
                            int dim_ind)
{
    real grid_jump_limit;

    /* The distance between the boundaries of cells at distance
     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
     * and by the fact that cells should not be shifted by more than
     * half their size, such that cg's only shift by one cell
     * at redecomposition.
     */
    grid_jump_limit = comm->cellsize_limit;
    if (!comm->bVacDLBNoLimit)
    {
        if (comm->bPMELoadBalDLBLimits)
        {
            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
        }
        grid_jump_limit = max(grid_jump_limit,
                              cutoff/comm->cd[dim_ind].np);
    }

    return grid_jump_limit;
}
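
/* Added illustration (not part of the original code), with hypothetical
 * numbers: for a cut-off of 1.0 nm and comm->cd[dim_ind].np = 2 pulses,
 * the limit is max(comm->cellsize_limit, 1.0/2) >= 0.5 nm, so staggered
 * boundaries of neighboring cell rows can never approach each other by
 * more than the distance covered per communication pulse.
 */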

static gmx_bool check_grid_jump(gmx_large_int_t step,
                                gmx_domdec_t   *dd,
                                real            cutoff,
                                gmx_ddbox_t    *ddbox,
                                gmx_bool        bFatal)
{
    gmx_domdec_comm_t *comm;
    int                d, dim;
    real               limit, bfac;
    gmx_bool           bInvalid;

    bInvalid = FALSE;

    comm = dd->comm;

    for (d = 1; d < dd->ndim; d++)
    {
        dim   = dd->dim[d];
        limit = grid_jump_limit(comm, cutoff, d);
        bfac  = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
        {
            bInvalid = TRUE;

            if (bFatal)
            {
                char buf[22];

                /* This error should never be triggered under normal
                 * circumstances, but you never know ...
                 */
                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.",
                          gmx_step_str(step, buf),
                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
            }
        }
    }

    return bInvalid;
}

static int dd_load_count(gmx_domdec_comm_t *comm)
{
    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
}

static float dd_force_load(gmx_domdec_comm_t *comm)
{
    float load;

    if (comm->eFlop)
    {
        load = comm->flop;
        if (comm->eFlop > 1)
        {
            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
        }
    }
    else
    {
        load = comm->cycl[ddCyclF];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* Subtract the maximum of the last n cycle counts
             * to get rid of possible high counts due to other sources,
             * for instance system activity, that would otherwise
             * affect the dynamic load balancing.
             */
            load -= comm->cycl_max[ddCyclF];
        }

#ifdef GMX_MPI
        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
        {
            float gpu_wait, gpu_wait_sum;

            gpu_wait = comm->cycl[ddCyclWaitGPU];
            if (comm->cycl_n[ddCyclF] > 1)
            {
                /* We should remove the WaitGPU time of the same MD step
                 * as the one with the maximum F time, since the F time
                 * and the wait time are not independent.
                 * Furthermore, the step for the max F time should be chosen
                 * the same on all ranks that share the same GPU.
                 * But to keep the code simple, we remove the average instead.
                 * The main reason for artificially long times at some steps
                 * is spurious CPU activity or MPI time, so we don't expect
                 * that changes in the GPU wait time matter a lot here.
                 */
                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
            }
            /* Sum the wait times over the ranks that share the same GPU */
            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                          comm->mpi_comm_gpu_shared);
            /* Replace the wait time by the average over the ranks */
            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
        }
#endif
    }

    return load;
}
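
/* Added illustration (not part of the original code), with hypothetical
 * numbers: for cycl_n[ddCyclF] = 5 sampled steps and gpu_wait = 1000
 * cycles, the rescaling above gives 1000*4/5 = 800, i.e. one average
 * step's worth of wait time is discarded, mirroring the subtraction of
 * cycl_max from the force load before the ranks that share a GPU
 * average the remainder.
 */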

static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
{
    gmx_domdec_comm_t *comm;
    int                i;

    comm = dd->comm;

    snew(*dim_f, dd->nc[dim]+1);
    (*dim_f)[0] = 0;
    for (i = 1; i < dd->nc[dim]; i++)
    {
        if (comm->slb_frac[dim])
        {
            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
        }
        else
        {
            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
        }
    }
    (*dim_f)[dd->nc[dim]] = 1;
}
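
/* Added illustration (not part of the original code), with hypothetical
 * values: for dd->nc[dim] = 4 and slb_frac[dim] = {0.4, 0.2, 0.2, 0.2} the
 * loop above accumulates dim_f = {0, 0.4, 0.6, 0.8, 1}; without slb_frac
 * it yields the uniform fractions {0, 0.25, 0.5, 0.75, 1}.
 */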

static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
{
    int  pmeindex, slab, nso, i;
    ivec xyz;

    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
    {
        ddpme->dim = YY;
    }
    else
    {
        ddpme->dim = dimind;
    }
    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);

    ddpme->nslab = (ddpme->dim == 0 ?
                    dd->comm->npmenodes_x :
                    dd->comm->npmenodes_y);

    if (ddpme->nslab <= 1)
    {
        return;
    }

    nso = dd->comm->npmenodes/ddpme->nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min, ddpme->nslab);
    snew(ddpme->pp_max, ddpme->nslab);
    for (slab = 0; slab < ddpme->nslab; slab++)
    {
        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    }
    for (i = 0; i < dd->nnodes; i++)
    {
        ddindex2xyz(dd->nc, i, xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX])
        {
            pmeindex = ddindex2pmeindex(dd, i);
            if (dimind == 0)
            {
                slab = pmeindex/nso;
            }
            else
            {
                slab = pmeindex % ddpme->nslab;
            }
            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
        }
    }

    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
}

int dd_pme_maxshift_x(gmx_domdec_t *dd)
{
    if (dd->comm->ddpme[0].dim == XX)
    {
        return dd->comm->ddpme[0].maxshift;
    }
    else
    {
        return 0;
    }
}

int dd_pme_maxshift_y(gmx_domdec_t *dd)
{
    if (dd->comm->ddpme[0].dim == YY)
    {
        return dd->comm->ddpme[0].maxshift;
    }
    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
    {
        return dd->comm->ddpme[1].maxshift;
    }
    else
    {
        return 0;
    }
}

static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int                nc, ns, s;
    int               *xmin, *xmax;
    real               range, pme_boundary;
    int                sh;

    comm = dd->comm;
    nc   = dd->nc[ddpme->dim];
    ns   = ddpme->nslab;

    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance between
         * performance and support for most charge-group/cut-off
         * combinations.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        sh = 1;
        for (s = 0; s < ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug, "PME slab communication range for dim %d is %d\n",
                ddpme->dim, ddpme->maxshift);
    }
}
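
/* Added illustration (not part of the original code): maxshift = sh means
 * a PME rank may need coordinates from PP cell rows up to sh slabs away.
 * In the optimal case above (ns <= 3, or a uniform grid with one slab per
 * cell row) each slab only ever overlaps its direct neighbors, so sh = 1;
 * without dim_match no alignment can be assumed and sh = ns/2 covers the
 * worst case.
 */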

static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int d, dim;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];
        if (dim < ddbox->nboundeddim &&
            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
        {
            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                      dd->nc[dim], dd->comm->cellsize_limit);
        }
    }
}

static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                                  gmx_bool bMaster, ivec npulse)
{
    gmx_domdec_comm_t *comm;
    int                d, j;
    rvec               cellsize_min;
    real              *cell_x, cell_dx, cellsize;

    comm = dd->comm;

    for (d = 0; d < DIM; d++)
    {
        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
        npulse[d]       = 1;
        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
        {
            /* Uniform grid */
            cell_dx = ddbox->box_size[d]/dd->nc[d];
            if (bMaster)
            {
                for (j = 0; j < dd->nc[d]+1; j++)
                {
                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
                }
            }
            else
            {
                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
            }
            cellsize = cell_dx*ddbox->skew_fac[d];
            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
            {
                npulse[d]++;
            }
            cellsize_min[d] = cellsize;
        }
        else
        {
            /* Statically load balanced grid */
            /* Also when we are not doing a master distribution we determine
             * all cell borders in a loop to obtain identical values
             * to the master distribution case and to determine npulse.
             */
            if (bMaster)
            {
                cell_x = dd->ma->cell_x[d];
            }
            else
            {
                snew(cell_x, dd->nc[d]+1);
            }
            cell_x[0] = ddbox->box0[d];
            for (j = 0; j < dd->nc[d]; j++)
            {
                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
                cell_x[j+1] = cell_x[j] + cell_dx;
                cellsize    = cell_dx*ddbox->skew_fac[d];
                while (cellsize*npulse[d] < comm->cutoff &&
                       npulse[d] < dd->nc[d]-1)
                {
                    npulse[d]++;
                }
                cellsize_min[d] = min(cellsize_min[d], cellsize);
            }
            if (!bMaster)
            {
                comm->cell_x0[d] = cell_x[dd->ci[d]];
                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
                sfree(cell_x);
            }
        }
        /* The following limitation is to avoid that a cell would receive
         * some of its own home charge groups back over the periodic boundary.
         * Double charge groups cause trouble with the global indices.
         */
        if (d < ddbox->npbcdim &&
            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
        {
            gmx_fatal_collective(FARGS, NULL, dd,
                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
                                 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
                                 comm->cutoff,
                                 dd->nc[d], dd->nc[d],
                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
        }
    }

    if (!comm->bDynLoadBal)
    {
        copy_rvec(cellsize_min, comm->cellsize_min);
    }

    for (d = 0; d < comm->npmedecompdim; d++)
    {
        set_pme_maxshift(dd, &comm->ddpme[d],
                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
                         comm->ddpme[d].slb_dim_f);
    }
}

static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    int                nrange[] = {range[0], range[1]};

    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                if (!bPBC && (i == 0 || i == dd->nc[dim]-1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old);

    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * for a and b nrange is used */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1]) /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1]) /* found last a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}

static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d, int dim, gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                       gmx_bool bUniform, gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int                ncd, d1, i, j, pos;
    real              *cell_size;
    real               load_aver, load_i, imbalance, change, change_max, sc;
    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
    real               change_limit;
    real               relax = 0.5;
    gmx_bool           bPBC;
    int                range[] = { 0, 0 };

    comm = dd->comm;

    /* Convert the maximum change from the input percentage to a fraction */
    change_limit = comm->dlb_scale_lim*0.01;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for (i = 0; i < ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    if (bUniform)
    {
        for (i = 0; i < ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm))
    {
        load_aver  = comm->load[d].sum_m/ncd;
        change_max = 0;
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change     = -relax*imbalance;
            change_max = max(change_max, max(change, -change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change       = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for (i = 1; i < ncd; i++)
        {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d, i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    range[1]          = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);

    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for (i = 0; i < ncd; i++)
    {
        if (debug)
        {
            fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
                    dim, i, root->cell_f[i], root->cell_f[i+1]);
        }

        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            char buf[22];
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step, buf), dim2char(dim), i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for (d1 = 0; d1 < d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
    }
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
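
/* Added illustration (not part of the original code): with the
 * underrelaxation factor relax = 0.5 used above, a cell whose measured
 * load is 20% above the row average (imbalance = 0.2) gets
 * change = -0.5*0.2 = -0.1, i.e. its target size shrinks by 10%. If the
 * largest |change| in the row would exceed comm->dlb_scale_lim percent,
 * all changes in the row are scaled down together so the balancing still
 * converges.
 */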

static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
                                             gmx_ddbox_t *ddbox, int dimind)
{
    gmx_domdec_comm_t *comm;
    int                dim;

    comm = dd->comm;

    /* Set the cell dimensions */
    dim                = dd->dim[dimind];
    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
    if (dim >= ddbox->nboundeddim)
    {
        comm->cell_x0[dim] += ddbox->box0[dim];
        comm->cell_x1[dim] += ddbox->box0[dim];
    }
}

static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                         int d, int dim, real *cell_f_row,
                                         gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int                d1, pos;

    comm = dd->comm;

#ifdef GMX_MPI
    /* Each node would only need to know two fractions,
     * but it is probably cheaper to broadcast the whole array.
     */
    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
              0, comm->mpi_comm_load[d]);
#endif
    /* Copy the fractions for this dimension from the buffer */
    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
    /* The whole array was communicated, so set the buffer position */
    pos = dd->nc[dim] + 1;
    for (d1 = 0; d1 <= d; d1++)
    {
        if (d1 < d)
        {
            /* Copy the cell fractions of the lower dimensions */
            comm->cell_f0[d1] = cell_f_row[pos++];
            comm->cell_f1[d1] = cell_f_row[pos++];
        }
        relative_to_absolute_cell_bounds(dd, ddbox, d1);
    }
    /* Convert the communicated shift from float to int */
    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
    if (d >= 1)
    {
        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
    }
}
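
/* Added illustration (not part of the original code); the exact packing is
 * defined by DD_CELL_F_SIZE elsewhere in this file, but the reads above
 * imply a row buffer laid out, for DD dimension index d, roughly as:
 *   cell_f_row[0 .. nc[dim]]            boundary fractions of this row,
 *   then 2 reals per lower dimension    (cell_f0/cell_f1 pairs),
 *   then the ddpme[0] maxshift          (and ddpme[1] when d >= 1),
 * which is why pos starts at nc[dim]+1 and the shifts are decoded last.
 */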

static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                         gmx_bool bUniform, gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int                d, dim, d1;
    gmx_bool           bRowMember, bRowRoot;
    real              *cell_f_row;

    comm = dd->comm;

    for (d = 0; d < dd->ndim; d++)
    {
        dim        = dd->dim[d];
        bRowMember = TRUE;
        bRowRoot   = TRUE;
        for (d1 = d; d1 < dd->ndim; d1++)
        {
            if (dd->ci[dd->dim[d1]] > 0)
            {
                if (d1 != d)
                {
                    bRowMember = FALSE;
                }
                bRowRoot = FALSE;
            }
        }
        if (bRowMember)
        {
            if (bRowRoot)
            {
                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
                                           ddbox, bDynamicBox, bUniform, step);
                cell_f_row = comm->root[d]->cell_f;
            }
            else
            {
                cell_f_row = comm->cell_f_row;
            }
            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
        }
    }
}

static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int d;

    /* This function assumes the box is static and should therefore
     * not be called when the box has changed since the last
     * call to dd_partition_system.
     */
    for (d = 0; d < dd->ndim; d++)
    {
        relative_to_absolute_cell_bounds(dd, ddbox, d);
    }
}

static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
                                  gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int                dim;

    comm = dd->comm;

    if (bDoDLB)
    {
        wallcycle_start(wcycle, ewcDDCOMMBOUND);
        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
    }
    else if (bDynamicBox)
    {
        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
    }

    /* Set the dimensions for which no DD is used */
    for (dim = 0; dim < DIM; dim++)
    {
        if (dd->nc[dim] == 1)
        {
            comm->cell_x0[dim] = 0;
            comm->cell_x1[dim] = ddbox->box_size[dim];
            if (dim >= ddbox->nboundeddim)
            {
                comm->cell_x0[dim] += ddbox->box0[dim];
                comm->cell_x1[dim] += ddbox->box0[dim];
            }
        }
    }
}

static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
{
    int                    d, np, i;
    gmx_domdec_comm_dim_t *cd;

    for (d = 0; d < dd->ndim; d++)
    {
        cd = &dd->comm->cd[d];
        np = npulse[dd->dim[d]];
        if (np > cd->np_nalloc)
        {
            if (debug)
            {
                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
                        dim2char(dd->dim[d]), np);
            }
            if (DDMASTER(dd) && cd->np_nalloc > 0)
            {
                fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
            }
            srenew(cd->ind, np);
            for (i = cd->np_nalloc; i < np; i++)
            {
                cd->ind[i].index  = NULL;
                cd->ind[i].nalloc = 0;
            }
            cd->np_nalloc = np;
        }
        cd->np = np;
    }
}

static void set_dd_cell_sizes(gmx_domdec_t *dd,
                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
                              gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int                d;
    ivec               npulse;

    comm = dd->comm;

    /* Copy the old cell boundaries for the cg displacement check */
    copy_rvec(comm->cell_x0, comm->old_cell_x0);
    copy_rvec(comm->cell_x1, comm->old_cell_x1);

    if (comm->bDynLoadBal)
    {
        if (DDMASTER(dd))
        {
            check_box_size(dd, ddbox);
        }
        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
    }
    else
    {
        set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
        realloc_comm_ind(dd, npulse);
    }

    if (debug)
    {
        for (d = 0; d < DIM; d++)
        {
            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
        }
    }
}

static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,
                                  rvec cell_ns_x0, rvec cell_ns_x1,
                                  gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int                dim_ind, dim;

    comm = dd->comm;

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Without PBC we don't have restrictions on the outer cells */
        if (!(dim >= ddbox->npbcdim &&
              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
            comm->bDynLoadBal &&
            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
            comm->cellsize_min[dim])
        {
            char buf[22];
            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                      gmx_step_str(step, buf), dim2char(dim),
                      comm->cell_x1[dim] - comm->cell_x0[dim],
                      ddbox->skew_fac[dim],
                      dd->comm->cellsize_min[dim],
                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }
    }

    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
    {
        /* Communicate the boundaries and update cell_ns_x0/1 */
        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
        if (dd->bGridJump && dd->ndim > 1)
        {
            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
        }
    }
}

static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
{
    if (YY < npbcdim)
    {
        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
    }
    else
    {
        tcm[YY][XX] = 0;
    }
    if (ZZ < npbcdim)
    {
        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
    }
    else
    {
        tcm[ZZ][XX] = 0;
        tcm[ZZ][YY] = 0;
    }
}
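
/* Added illustration (not part of the original code): callers reduce a
 * Cartesian position to its untilted coordinate with the row sums of tcm,
 * e.g. for dimension XX:
 *   pos_x = x[XX] + x[YY]*tcm[YY][XX] + x[ZZ]*tcm[ZZ][XX]
 * A sketch with hypothetical box vectors a = (5,0,0) and b = (2,4,0):
 */
#if 0
{
    matrix box = {{5, 0, 0}, {2, 4, 0}, {0, 0, 1}}, tcm;
    rvec   x   = {6, 2, 0}; /* the corner a + 0.5*b */
    real   pos_x;

    make_tric_corr_matrix(2, box, tcm); /* tcm[YY][XX] = -2/4 = -0.5 */
    pos_x = x[XX] + x[YY]*tcm[YY][XX];  /* 6 + 2*(-0.5) = 5          */
    /* pos_x equals box[XX][XX]: the point sits exactly on the upper x
     * boundary of the untilted cell, as expected for fraction 1.0
     * along the first box vector. */
}
#endif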

static void check_screw_box(matrix box)
{
    /* Mathematical limitation */
    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
    {
        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
    }

    /* Limitation due to the asymmetry of the eighth shell method */
    if (box[ZZ][YY] != 0)
    {
        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
    }
}

static void distribute_cg(FILE *fplog, gmx_large_int_t step,
                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
                          gmx_domdec_t *dd)
{
    gmx_domdec_master_t *ma;
    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
    int                  i, icg, j, k, k0, k1, d, npbcdim;
    matrix               tcm;
    rvec                 box_size, cg_cm;
    ivec                 ind;
    real                 nrcg, inv_ncg, pos_d;
    atom_id             *cgindex;
    gmx_bool             bUnbounded, bScrew;

    ma = dd->ma;

    if (tmp_ind == NULL)
    {
        snew(tmp_nalloc, dd->nnodes);
        snew(tmp_ind, dd->nnodes);
        for (i = 0; i < dd->nnodes; i++)
        {
            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
            snew(tmp_ind[i], tmp_nalloc[i]);
        }
    }

    /* Clear the count */
    for (i = 0; i < dd->nnodes; i++)
    {
        ma->ncg[i] = 0;
        ma->nat[i] = 0;
    }

    make_tric_corr_matrix(dd->npbcdim, box, tcm);

    cgindex = cgs->index;

    /* Compute the center of geometry for all charge groups */
    for (icg = 0; icg < cgs->nr; icg++)
    {
        k0   = cgindex[icg];
        k1   = cgindex[icg+1];
        nrcg = k1 - k0;
        if (nrcg == 1)
        {
            copy_rvec(pos[k0], cg_cm);
        }
        else
        {
            inv_ncg = 1.0/nrcg;

            clear_rvec(cg_cm);
            for (k = k0; (k < k1); k++)
            {
                rvec_inc(cg_cm, pos[k]);
            }
            for (d = 0; (d < DIM); d++)
            {
                cg_cm[d] *= inv_ncg;
            }
        }
        /* Put the charge group in the box and determine the cell index */
        for (d = DIM-1; d >= 0; d--)
        {
            pos_d = cg_cm[d];
            if (d < dd->npbcdim)
            {
                bScrew = (dd->bScrewPBC && d == XX);
                if (tric_dir[d] && dd->nc[d] > 1)
                {
                    /* Use triclinic coordinates for this dimension */
                    for (j = d+1; j < DIM; j++)
                    {
                        pos_d += cg_cm[j]*tcm[j][d];
                    }
                }
                while (pos_d >= box[d][d])
                {
                    pos_d -= box[d][d];
                    rvec_dec(cg_cm, box[d]);
                    if (bScrew)
                    {
                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                    }
                    for (k = k0; (k < k1); k++)
                    {
                        rvec_dec(pos[k], box[d]);
                        if (bScrew)
                        {
                            pos[k][YY] = box[YY][YY] - pos[k][YY];
                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
                        }
                    }
                }
                while (pos_d < 0)
                {
                    pos_d += box[d][d];
                    rvec_inc(cg_cm, box[d]);
                    if (bScrew)
                    {
                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                    }
                    for (k = k0; (k < k1); k++)
                    {
                        rvec_inc(pos[k], box[d]);
                        if (bScrew)
                        {
                            pos[k][YY] = box[YY][YY] - pos[k][YY];
                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
                        }
                    }
                }
            }
            /* This could be done more efficiently */
            ind[d] = 0;
            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
            {
                ind[d]++;
            }
        }
        i = dd_index(dd->nc, ind);
        if (ma->ncg[i] == tmp_nalloc[i])
        {
            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
            srenew(tmp_ind[i], tmp_nalloc[i]);
        }
        tmp_ind[i][ma->ncg[i]] = icg;
        ma->ncg[i]++;
        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
    }

    k1 = 0;
    for (i = 0; i < dd->nnodes; i++)
    {
        ma->index[i] = k1;
        for (k = 0; k < ma->ncg[i]; k++)
        {
            ma->cg[k1++] = tmp_ind[i][k];
        }
    }
    ma->index[dd->nnodes] = k1;

    for (i = 0; i < dd->nnodes; i++)
    {
        sfree(tmp_ind[i]);
    }
    sfree(tmp_ind);
    sfree(tmp_nalloc);

    if (fplog)
    {
        char buf[22];

        fprintf(fplog, "Charge group distribution at step %s:",
                gmx_step_str(step, buf));
        for (i = 0; i < dd->nnodes; i++)
        {
            fprintf(fplog, " %d", ma->ncg[i]);
        }
        fprintf(fplog, "\n");
    }
}

static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
                                rvec pos[])
{
    gmx_domdec_master_t *ma = NULL;
    ivec                 npulse;
    int                  i, cg_gl;
    int                 *ibuf, buf2[2] = { 0, 0 };
    gmx_bool             bMaster = DDMASTER(dd);

    if (bMaster)
    {
        ma = dd->ma;

        if (dd->bScrewPBC)
        {
            check_screw_box(box);
        }

        set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);

        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ibuf[2*i]   = ma->ncg[i];
            ma->ibuf[2*i+1] = ma->nat[i];
        }
        ibuf = ma->ibuf;
    }
    else
    {
        ibuf = NULL;
    }
    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);

    dd->ncg_home = buf2[0];
    dd->nat_home = buf2[1];
    dd->ncg_tot  = dd->ncg_home;
    dd->nat_tot  = dd->nat_home;
    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
    {
        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
        srenew(dd->index_gl, dd->cg_nalloc);
        srenew(dd->cgindex, dd->cg_nalloc+1);
    }
    if (bMaster)
    {
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
        }
    }

    dd_scatterv(dd,
                DDMASTER(dd) ? ma->ibuf : NULL,
                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
                DDMASTER(dd) ? ma->cg : NULL,
                dd->ncg_home*sizeof(int), dd->index_gl);

    /* Determine the home charge group sizes */
    dd->cgindex[0] = 0;
    for (i = 0; i < dd->ncg_home; i++)
    {
        cg_gl            = dd->index_gl[i];
        dd->cgindex[i+1] =
            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
    }

    if (debug)
    {
        fprintf(debug, "Home charge groups:\n");
        for (i = 0; i < dd->ncg_home; i++)
        {
            fprintf(debug, " %d", dd->index_gl[i]);
            if (i % 10 == 9)
            {
                fprintf(debug, "\n");
            }
        }
        fprintf(debug, "\n");
    }
}

static int compact_and_copy_vec_at(int ncg, int *move,
                                   int *cgindex,
                                   int nvec, int vec,
                                   rvec *src, gmx_domdec_comm_t *comm,
                                   gmx_bool bCompact)
{
    int m, icg, i, i0, i1, nrcg;
    int home_pos;
    int pos_vec[DIM*2];

    home_pos = 0;

    for (m = 0; m < DIM*2; m++)
    {
        pos_vec[m] = 0;
    }

    i0 = 0;
    for (icg = 0; icg < ncg; icg++)
    {
        i1 = cgindex[icg+1];
        m  = move[icg];
        if (m == -1)
        {
            if (bCompact)
            {
                /* Compact the home array in place */
                for (i = i0; i < i1; i++)
                {
                    copy_rvec(src[i], src[home_pos++]);
                }
            }
        }
        else
        {
            /* Copy to the communication buffer */
            nrcg        = i1 - i0;
            pos_vec[m] += 1 + vec*nrcg;
            for (i = i0; i < i1; i++)
            {
                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
            }
            pos_vec[m] += (nvec - vec - 1)*nrcg;
        }
        if (!bCompact)
        {
            home_pos += i1 - i0;
        }
        i0 = i1;
    }

    return home_pos;
}
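
/* Added illustration (not part of the original code): for every moved
 * charge group, comm->cgcm_state[m] holds one rvec for the charge group
 * center followed by nvec blocks of nrcg rvecs (x, then v, sd_X, cg_p when
 * present). The "pos_vec[m] += 1 + vec*nrcg" before the copy skips the
 * header and the blocks of vectors already written, and the
 * "pos_vec[m] += (nvec - vec - 1)*nrcg" afterwards reserves the slots for
 * the vectors still to come, so repeated calls with vec = 0, 1, ...
 * interleave correctly into the same send buffer.
 */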

static int compact_and_copy_vec_cg(int ncg, int *move,
                                   int *cgindex,
                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
                                   gmx_bool bCompact)
{
    int m, icg, i0, i1, nrcg;
    int home_pos;
    int pos_vec[DIM*2];

    home_pos = 0;

    for (m = 0; m < DIM*2; m++)
    {
        pos_vec[m] = 0;
    }

    i0 = 0;
    for (icg = 0; icg < ncg; icg++)
    {
        i1 = cgindex[icg+1];
        m  = move[icg];
        if (m == -1)
        {
            if (bCompact)
            {
                /* Compact the home array in place */
                copy_rvec(src[icg], src[home_pos++]);
            }
        }
        else
        {
            nrcg = i1 - i0;
            /* Copy to the communication buffer */
            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
            pos_vec[m] += 1 + nrcg*nvec;
        }
        i0 = i1;
    }
    if (!bCompact)
    {
        home_pos = ncg;
    }

    return home_pos;
}

static int compact_ind(int ncg, int *move,
                       int *index_gl, int *cgindex,
                       int *gatindex,
                       gmx_ga2la_t ga2la, char *bLocalCG,
                       int *cginfo)
{
    int cg, nat, a0, a1, a, a_gl;
    int home_pos;

    home_pos = 0;
    nat      = 0;
    for (cg = 0; cg < ncg; cg++)
    {
        a0 = cgindex[cg];
        a1 = cgindex[cg+1];
        if (move[cg] == -1)
        {
            /* Compact the home arrays in place.
             * Anything that can be done here avoids access to global arrays.
             */
            cgindex[home_pos] = nat;
            for (a = a0; a < a1; a++)
            {
                a_gl          = gatindex[a];
                gatindex[nat] = a_gl;
                /* The cell number stays 0, so we don't need to set it */
                ga2la_change_la(ga2la, a_gl, nat);
                nat++;
            }
            index_gl[home_pos] = index_gl[cg];
            cginfo[home_pos]   = cginfo[cg];
            /* The charge group remains local, so bLocalCG does not change */
            home_pos++;
        }
        else
        {
            /* Clear the global indices */
            for (a = a0; a < a1; a++)
            {
                ga2la_del(ga2la, gatindex[a]);
            }
            if (bLocalCG)
            {
                bLocalCG[index_gl[cg]] = FALSE;
            }
        }
    }
    cgindex[home_pos] = nat;

    return home_pos;
}

static void clear_and_mark_ind(int ncg, int *move,
                               int *index_gl, int *cgindex, int *gatindex,
                               gmx_ga2la_t ga2la, char *bLocalCG,
                               int *cell_index)
{
    int cg, a0, a1, a;

    for (cg = 0; cg < ncg; cg++)
    {
        if (move[cg] >= 0)
        {
            a0 = cgindex[cg];
            a1 = cgindex[cg+1];
            /* Clear the global indices */
            for (a = a0; a < a1; a++)
            {
                ga2la_del(ga2la, gatindex[a]);
            }
            if (bLocalCG)
            {
                bLocalCG[index_gl[cg]] = FALSE;
            }
            /* Signal that this cg has moved using the ns cell index.
             * Here we set it to -1. fill_grid will change it
             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
             */
            cell_index[cg] = -1;
        }
    }
}

static void print_cg_move(FILE *fplog,
                          gmx_domdec_t *dd,
                          gmx_large_int_t step, int cg, int dim, int dir,
                          gmx_bool bHaveLimitdAndCMOld, real limitd,
                          rvec cm_old, rvec cm_new, real pos_d)
{
    gmx_domdec_comm_t *comm;
    char               buf[22];

    comm = dd->comm;

    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
    if (bHaveLimitdAndCMOld)
    {
        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
    }
    else
    {
        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
    }
    fprintf(fplog, "distance out of cell %f\n",
            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
    if (bHaveLimitdAndCMOld)
    {
        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
                cm_old[XX], cm_old[YY], cm_old[ZZ]);
    }
    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
            cm_new[XX], cm_new[YY], cm_new[ZZ]);
    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
            dim2char(dim),
            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
            dim2char(dim),
            comm->cell_x0[dim], comm->cell_x1[dim]);
}

static void cg_move_error(FILE *fplog,
                          gmx_domdec_t *dd,
                          gmx_large_int_t step, int cg, int dim, int dir,
                          gmx_bool bHaveLimitdAndCMOld, real limitd,
                          rvec cm_old, rvec cm_new, real pos_d)
{
    if (fplog)
    {
        print_cg_move(fplog, dd, step, cg, dim, dir,
                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
    }
    print_cg_move(stderr, dd, step, cg, dim, dir,
                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
    gmx_fatal(FARGS,
              "A charge group moved too far between two domain decomposition steps\n"
              "This usually means that your system is not well equilibrated");
}

static void rotate_state_atom(t_state *state, int a)
{
    int est;

    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state->flags & (1<<est)))
        {
            switch (est)
            {
                case estX:
                    /* Rotate the complete state; for a rectangular box only */
                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
                    break;
                case estV:
                    state->v[a][YY] = -state->v[a][YY];
                    state->v[a][ZZ] = -state->v[a][ZZ];
                    break;
                case estSDX:
                    state->sd_X[a][YY] = -state->sd_X[a][YY];
                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
                    break;
                case estCGP:
                    state->cg_p[a][YY] = -state->cg_p[a][YY];
                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* These are distances, so not affected by rotation */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
            }
        }
    }
}

static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
{
    if (natoms > comm->moved_nalloc)
    {
        /* Contents should be preserved here */
        comm->moved_nalloc = over_alloc_dd(natoms);
        srenew(comm->moved, comm->moved_nalloc);
    }

    return comm->moved;
}

static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
                         gmx_domdec_t *dd,
                         t_state *state,
                         ivec tric_dir, matrix tcm,
                         rvec cell_x0, rvec cell_x1,
                         rvec limitd, rvec limit0, rvec limit1,
                         const int *cgindex,
                         int cg_start, int cg_end,
                         rvec *cg_cm,
                         int *move)
{
    int      npbcdim;
    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
    int      flag;
    gmx_bool bScrew;
    ivec     dev;
    real     inv_ncg, pos_d;
    rvec     cm_new;

    npbcdim = dd->npbcdim;

    for (cg = cg_start; cg < cg_end; cg++)
    {
        k0   = cgindex[cg];
        k1   = cgindex[cg+1];
        nrcg = k1 - k0;
        if (nrcg == 1)
        {
            copy_rvec(state->x[k0], cm_new);
        }
        else
        {
            inv_ncg = 1.0/nrcg;

            clear_rvec(cm_new);
            for (k = k0; (k < k1); k++)
            {
                rvec_inc(cm_new, state->x[k]);
            }
            for (d = 0; (d < DIM); d++)
            {
                cm_new[d] = inv_ncg*cm_new[d];
            }
        }

        clear_ivec(dev);
        /* Do pbc and check DD cell boundary crossings */
        for (d = DIM-1; d >= 0; d--)
        {
            if (dd->nc[d] > 1)
            {
                bScrew = (dd->bScrewPBC && d == XX);
                /* Determine the location of this cg in lattice coordinates */
                pos_d = cm_new[d];
                if (tric_dir[d])
                {
                    for (d2 = d+1; d2 < DIM; d2++)
                    {
                        pos_d += cm_new[d2]*tcm[d2][d];
                    }
                }
                /* Put the charge group in the triclinic unit-cell */
                if (pos_d >= cell_x1[d])
                {
                    if (pos_d >= limit1[d])
                    {
                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
                                      cg_cm[cg], cm_new, pos_d);
                    }
                    dev[d] = 1;
                    if (dd->ci[d] == dd->nc[d] - 1)
                    {
                        rvec_dec(cm_new, state->box[d]);
                        if (bScrew)
                        {
                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                        }
                        for (k = k0; (k < k1); k++)
                        {
                            rvec_dec(state->x[k], state->box[d]);
                            if (bScrew)
                            {
                                rotate_state_atom(state, k);
                            }
                        }
                    }
                }
                else if (pos_d < cell_x0[d])
                {
                    if (pos_d < limit0[d])
                    {
                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
                                      cg_cm[cg], cm_new, pos_d);
                    }
                    dev[d] = -1;
                    if (dd->ci[d] == 0)
                    {
                        rvec_inc(cm_new, state->box[d]);
                        if (bScrew)
                        {
                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                        }
                        for (k = k0; (k < k1); k++)
                        {
                            rvec_inc(state->x[k], state->box[d]);
                            if (bScrew)
                            {
                                rotate_state_atom(state, k);
                            }
                        }
                    }
                }
            }
            else if (d < npbcdim)
            {
                /* Put the charge group in the rectangular unit-cell */
                while (cm_new[d] >= state->box[d][d])
                {
                    rvec_dec(cm_new, state->box[d]);
                    for (k = k0; (k < k1); k++)
                    {
                        rvec_dec(state->x[k], state->box[d]);
                    }
                }
                while (cm_new[d] < 0)
                {
                    rvec_inc(cm_new, state->box[d]);
                    for (k = k0; (k < k1); k++)
                    {
                        rvec_inc(state->x[k], state->box[d]);
                    }
                }
            }
        }

        copy_rvec(cm_new, cg_cm[cg]);

        /* Determine where this cg should go */
        flag = 0;
        mc   = -1;
        for (d = 0; d < dd->ndim; d++)
        {
            dim = dd->dim[d];
            if (dev[dim] == 1)
            {
                flag |= DD_FLAG_FW(d);
                if (mc == -1)
                {
                    mc = d*2;
                }
            }
            else if (dev[dim] == -1)
            {
                flag |= DD_FLAG_BW(d);
                if (mc == -1)
                {
                    if (dd->nc[dim] > 2)
                    {
                        mc = d*2 + 1;
                    }
                    else
                    {
                        mc = d*2;
                    }
                }
            }
        }
        /* Temporarily store the flag in move */
        move[cg] = mc + flag;
    }
}

static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
                               gmx_domdec_t *dd, ivec tric_dir,
                               t_state *state, rvec **f,
                               t_forcerec *fr,
                               gmx_bool bCompact,
                               t_nrnb *nrnb,
                               int *ncg_stay_home,
                               int *ncg_moved)
{
    int               *move;
    int                npbcdim;
    int                ncg[DIM*2], nat[DIM*2];
    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
    int                sbuf[2], rbuf[2];
    int                home_pos_cg, home_pos_at, buf_pos;
    int                flag;
    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
    real               inv_ncg, pos_d;
    matrix             tcm;
    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
    atom_id           *cgindex;
    cginfo_mb_t       *cginfo_mb;
    gmx_domdec_comm_t *comm;
    int               *moved;
    int                nthread, thread;

    if (dd->bScrewPBC)
    {
        check_screw_box(state->box);
    }

    comm = dd->comm;
    if (fr->cutoff_scheme == ecutsGROUP)
    {
        cg_cm = fr->cg_cm;
    }

    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i))
        {
            switch (i)
            {
                case estX: /* Always present */ break;
                case estV:   bV   = (state->flags & (1<<i)); break;
                case estSDX: bSDX = (state->flags & (1<<i)); break;
                case estCGP: bCGP = (state->flags & (1<<i)); break;
                case estLD_RNG:
                case estLD_RNGI:
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* No processing required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
            }
        }
    }

    if (dd->ncg_tot > comm->nalloc_int)
    {
        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
        srenew(comm->buf_int, comm->nalloc_int);
    }
    move = comm->buf_int;

    /* Clear the count */
    for (c = 0; c < dd->ndim*2; c++)
    {
        ncg[c] = 0;
        nat[c] = 0;
    }

    npbcdim = dd->npbcdim;

    for (d = 0; (d < DIM); d++)
    {
        limitd[d] = dd->comm->cellsize_min[d];
        if (d >= npbcdim && dd->ci[d] == 0)
        {
            cell_x0[d] = -GMX_FLOAT_MAX;
        }
        else
        {
            cell_x0[d] = comm->cell_x0[d];
        }
        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
        {
            cell_x1[d] = GMX_FLOAT_MAX;
        }
        else
        {
            cell_x1[d] = comm->cell_x1[d];
        }
        if (d < npbcdim)
        {
            limit0[d] = comm->old_cell_x0[d] - limitd[d];
            limit1[d] = comm->old_cell_x1[d] + limitd[d];
        }
        else
        {
            /* We check after communication if a charge group moved
             * more than one cell. Set the pre-comm check limit to float_max.
             */
            limit0[d] = -GMX_FLOAT_MAX;
            limit1[d] =  GMX_FLOAT_MAX;
        }
    }

    make_tric_corr_matrix(npbcdim, state->box, tcm);

    cgindex = dd->cgindex;

    nthread = gmx_omp_nthreads_get(emntDomdec);

    /* Compute the center of geometry for all home charge groups
     * and put them in the box and determine where they should go.
     */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
                     cell_x0, cell_x1, limitd, limit0, limit1,
                     cgindex,
                     ( thread   *dd->ncg_home)/nthread,
                     ((thread+1)*dd->ncg_home)/nthread,
                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
                     move);
    }

    for (cg = 0; cg < dd->ncg_home; cg++)
    {
        if (move[cg] >= 0)
        {
            mc       = move[cg];
            flag     = mc & ~DD_FLAG_NRCG;
            mc       = mc & DD_FLAG_NRCG;
            move[cg] = mc;

            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
            {
                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
            }
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
            /* We store the cg size in the lower 16 bits
             * and the place where the charge group should go
             * in the next 6 bits. This saves some communication volume.
             */
            nrcg = cgindex[cg+1] - cgindex[cg];
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
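            /* Added illustration (not part of the original code), assuming
             * the conventional DD_FLAG_* layout defined earlier in this
             * file (DD_FLAG_NRCG == 0xffff, DD_FLAG_FW(d) == 1<<(16+2*d)):
             * a charge group of 3 atoms moving forward in DD dimension 1
             * is packed as 3 | DD_FLAG_FW(1) = 3 | (1<<18) = 0x40003; the
             * receiver recovers the size with (flag & DD_FLAG_NRCG) and
             * the direction from the upper bits.
             */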
            ncg[mc] += 1;
            nat[mc] += nrcg;
        }
    }

    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);

    *ncg_moved = 0;
    for (i = 0; i < dd->ndim*2; i++)
    {
        *ncg_moved += ncg[i];
    }

    nvec = 1;
    if (bV)
    {
        nvec++;
    }
    if (bSDX)
    {
        nvec++;
    }
    if (bCGP)
    {
        nvec++;
    }

    /* Make sure the communication buffers are large enough */
    for (mc = 0; mc < dd->ndim*2; mc++)
    {
        nvr = ncg[mc] + nat[mc]*nvec;
        if (nvr > comm->cgcm_state_nalloc[mc])
        {
            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
        }
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            /* Recalculating cg_cm might be cheaper than communicating,
             * but that could give rise to rounding issues.
             */
            home_pos_cg =
                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
                                        nvec, cg_cm, comm, bCompact);
            break;
        case ecutsVERLET:
            /* Without charge groups we send the moved atom coordinates
             * over twice. This is so the code below can be used without
             * many conditionals both with and without charge groups.
             */
            home_pos_cg =
                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
                                        nvec, state->x, comm, FALSE);
            if (bCompact)
            {
                home_pos_cg -= *ncg_moved;
            }
            break;
        default:
            gmx_incons("unimplemented");
            home_pos_cg = 0;
    }

    vec         = 0;
    home_pos_at =
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->x, comm, bCompact);
    if (bV)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->v, comm, bCompact);
    }
    if (bSDX)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->sd_X, comm, bCompact);
    }
    if (bCGP)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->cg_p, comm, bCompact);
    }

    if (bCompact)
    {
        compact_ind(dd->ncg_home, move,
                    dd->index_gl, dd->cgindex, dd->gatindex,
                    dd->ga2la, comm->bLocalCG,
                    fr->cginfo);
    }
    else
    {
        if (fr->cutoff_scheme == ecutsVERLET)
        {
            moved = get_moved(comm, dd->ncg_home);

            for (k = 0; k < dd->ncg_home; k++)
            {
                moved[k] = 0;
            }
        }
        else
        {
            moved = fr->ns.grid->cell_index;
        }

        clear_and_mark_ind(dd->ncg_home, move,
                           dd->index_gl, dd->cgindex, dd->gatindex,
                           dd->ga2la, comm->bLocalCG,
                           moved);
    }

    cginfo_mb = fr->cginfo_mb;

    *ncg_stay_home = home_pos_cg;
    for (d = 0; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        ncg_recv = 0;
        nat_recv = 0;
        nvr      = 0;
        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
        {
            cdd = d*2 + dir;
            /* Communicate the cg and atom counts */
            sbuf[0] = ncg[cdd];
            sbuf[1] = nat[cdd];
            if (debug)
            {
                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
                        d, dir, sbuf[0], sbuf[1]);
            }
            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);

            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
            {
                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
                srenew(comm->buf_int, comm->nalloc_int);
            }

            /* Communicate the charge group indices, sizes and flags */
            dd_sendrecv_int(dd, d, dir,
                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);

            nvs = ncg[cdd] + nat[cdd]*nvec;
            i   = rbuf[0]  + rbuf[1] *nvec;
            vec_rvec_check_alloc(&comm->vbuf, nvr+i);

            /* Communicate cgcm and state */
            dd_sendrecv_rvec(dd, d, dir,
                             comm->cgcm_state[cdd], nvs,
                             comm->vbuf.v+nvr, i);
            ncg_recv += rbuf[0];
            nat_recv += rbuf[1];
            nvr      += i;
        }

        /* Process the received charge groups */
        buf_pos = 0;
        for (cg = 0; cg < ncg_recv; cg++)
        {
            flag = comm->buf_int[cg*DD_CGIBS+1];

            if (dim >= npbcdim && dd->nc[dim] > 2)
            {
                /* No pbc in this dim and more than one domain boundary.
                 * We do a separate check if a charge group didn't move too far.
                 */
                if (((flag & DD_FLAG_FW(d)) &&
                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
                    ((flag & DD_FLAG_BW(d)) &&
                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
                {
                    cg_move_error(fplog, dd, step, cg, dim,
                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
                                  FALSE, 0,
                                  comm->vbuf.v[buf_pos],
                                  comm->vbuf.v[buf_pos],
                                  comm->vbuf.v[buf_pos][dim]);
                }
            }

            mc = -1;
            if (d < dd->ndim-1)
            {
                /* Check which direction this cg should go */
                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                {
                    if (dd->bGridJump)
                    {
                        /* The cell boundaries for dimension d2 are not equal
                         * for each cell row of the lower dimension(s),
                         * therefore we might need to redetermine where
                         * this cg should go.
                         */
                        dim2 = dd->dim[d2];
                        /* If this cg crosses the box boundary in dimension d2
                         * we can use the communicated flag, so we do not
                         * have to worry about pbc.
                         */
                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
                               (flag & DD_FLAG_FW(d2))) ||
                              (dd->ci[dim2] == 0 &&
                               (flag & DD_FLAG_BW(d2)))))
                        {
                            /* Clear the two flags for this dimension */
                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
                            /* Determine the location of this cg
                             * in lattice coordinates
                             */
                            pos_d = comm->vbuf.v[buf_pos][dim2];
                            if (tric_dir[dim2])
                            {
                                for (d3 = dim2+1; d3 < DIM; d3++)
                                {
                                    pos_d +=
                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
                                }
                            }
                            /* Check if we are not at the box edge.
                             * pbc is only handled in the first step above,
                             * but this check could move over pbc while
                             * the first step did not due to different rounding.
                             */
                            if (pos_d >= cell_x1[dim2] &&
                                dd->ci[dim2] != dd->nc[dim2]-1)
                            {
                                flag |= DD_FLAG_FW(d2);
                            }
                            else if (pos_d < cell_x0[dim2] &&
                                     dd->ci[dim2] != 0)
                            {
                                flag |= DD_FLAG_BW(d2);
                            }
                            comm->buf_int[cg*DD_CGIBS+1] = flag;
                        }
                    }
                    /* Set to which neighboring cell this cg should go */
                    if (flag & DD_FLAG_FW(d2))
                    {
                        mc = d2*2;
                    }
                    else if (flag & DD_FLAG_BW(d2))
                    {
                        if (dd->nc[dd->dim[d2]] > 2)
                        {
                            mc = d2*2+1;
                        }
                        else
                        {
                            mc = d2*2;
                        }
                    }
                }
            }

            nrcg = flag & DD_FLAG_NRCG;
            if (mc == -1)
            {
                if (home_pos_cg+1 > dd->cg_nalloc)
                {
                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
                    srenew(dd->index_gl, dd->cg_nalloc);
                    srenew(dd->cgindex, dd->cg_nalloc+1);
                }
                /* Set the global charge group index and size */
                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
                /* Copy the state from the buffer */
                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
                if (fr->cutoff_scheme == ecutsGROUP)
                {
                    cg_cm = fr->cg_cm;
                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
                }
                buf_pos++;

                /* Set the cginfo */
                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
                                                   dd->index_gl[home_pos_cg]);
                if (comm->bLocalCG)
                {
                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
                }

                if (home_pos_at+nrcg > state->nalloc)
                {
                    dd_realloc_state(state, f, home_pos_at+nrcg);
                }
                for (i = 0; i < nrcg; i++)
                {
                    copy_rvec(comm->vbuf.v[buf_pos++],
                              state->x[home_pos_at+i]);
                }
                if (bV)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->v[home_pos_at+i]);
                    }
                }
                if (bSDX)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->sd_X[home_pos_at+i]);
                    }
                }
                if (bCGP)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->cg_p[home_pos_at+i]);
                    }
                }
                home_pos_cg += 1;
                home_pos_at += nrcg;
            }
            else
            {
                /* Reallocate the buffers if necessary */
                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
                {
                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
                }
                nvr = ncg[mc] + nat[mc]*nvec;
                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
                {
                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
                }
                /* Copy from the receive to the send buffers */
                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
                       comm->buf_int + cg*DD_CGIBS,
                       DD_CGIBS*sizeof(int));
                memcpy(comm->cgcm_state[mc][nvr],
                       comm->vbuf.v[buf_pos],
                       (1+nrcg*nvec)*sizeof(rvec));
                buf_pos += 1 + nrcg*nvec;
                ncg[mc] += 1;
                nat[mc] += nrcg;
            }
        }
    }

    /* With sorting (!bCompact) the indices are now only partially up to date
     * and ncg_home and nat_home are not the real count, since there are
     * "holes" in the arrays for the charge groups that moved to neighbors.
     */
    if (fr->cutoff_scheme == ecutsVERLET)
    {
        moved = get_moved(comm, home_pos_cg);

        for (i = dd->ncg_home; i < home_pos_cg; i++)
        {
            moved[i] = 0;
        }
    }
    dd->ncg_home = home_pos_cg;
    dd->nat_home = home_pos_at;

    if (debug)
    {
        fprintf(debug,
                "Finished repartitioning: cgs moved out %d, new home %d\n",
                *ncg_moved, dd->ncg_home-*ncg_moved);
    }
}

void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
{
    dd->comm->cycl[ddCycl] += cycles;
    dd->comm->cycl_n[ddCycl]++;
    if (cycles > dd->comm->cycl_max[ddCycl])
    {
        dd->comm->cycl_max[ddCycl] = cycles;
    }
}

static double force_flop_count(t_nrnb *nrnb)
{
    int         i;
    double      sum;
    const char *name;

    sum = 0;
    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
    {
        /* To get closer to the real timings, we halve the count
         * for the normal loops and again halve it for water loops.
         */
        name = nrnb_str(i);
        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
        {
            sum += nrnb->n[i]*0.25*cost_nrnb(i);
        }
        else
        {
            sum += nrnb->n[i]*0.50*cost_nrnb(i);
        }
    }
    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
    {
        name = nrnb_str(i);
        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
        {
            sum += nrnb->n[i]*cost_nrnb(i);
        }
    }
    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
    {
        sum += nrnb->n[i]*cost_nrnb(i);
    }

    return sum;
}

void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop -= force_flop_count(nrnb);
    }
}

void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop += force_flop_count(nrnb);
        dd->comm->flop_n++;
    }
}

static void clear_dd_cycle_counts(gmx_domdec_t *dd)
{
    int i;

    for (i = 0; i < ddCyclNr; i++)
    {
        dd->comm->cycl[i]     = 0;
        dd->comm->cycl_n[i]   = 0;
        dd->comm->cycl_max[i] = 0;
    }
    dd->comm->flop   = 0;
    dd->comm->flop_n = 0;
}

static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    gmx_domdec_load_t *load;
    gmx_domdec_root_t *root = NULL;
    int                d, dim, cid, i, pos;
    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
    gmx_bool           bSepPME;

    if (debug)
    {
        fprintf(debug, "get_load_distribution start\n");
    }

    wallcycle_start(wcycle, ewcDDCOMMLOAD);

    comm = dd->comm;

    bSepPME = (dd->pme_nodeid >= 0);

    for (d = dd->ndim-1; d >= 0; d--)
    {
        dim = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
        {
            load = &comm->load[d];
            if (dd->bGridJump)
            {
                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
            }
            pos = 0;
            if (d == dd->ndim-1)
            {
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (dd->bGridJump)
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (dd->bGridJump)
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are setup such that the root always has rank 0.
             */
#ifdef GMX_MPI
            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
                       load->load, load->nload*sizeof(float), MPI_BYTE,
                       0, comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the root, process this row */
                if (comm->bDynLoadBal)
                {
                    root = comm->root[d];
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                pos            = 0;
                for (i = 0; i < dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max  = max(load->max, load->load[pos]);
                    pos++;
                    if (dd->bGridJump)
                    {
                        if (root->bLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum instead of
                             * the average load.
                             */
                            load->sum_m = max(load->sum_m, load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = min(load->cvol_min, load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            load->flags = (int)(load->load[pos++] + 0.5);
                        }
                        if (d > 0)
                        {
                            root->cell_f_max0[i] = load->load[pos++];
                            root->cell_f_min1[i] = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = max(load->mdf, load->load[pos]);
                        pos++;
                        load->pme = max(load->pme, load->load[pos]);
                        pos++;
                    }
                }
                if (comm->bDynLoadBal && root->bLimited)
                {
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        comm->nload     += dd_load_count(comm);
        comm->load_step += comm->cycl[ddCyclStep];
        comm->load_sum  += comm->load[0].sum;
        comm->load_max  += comm->load[0].max;
        if (comm->bDynLoadBal)
        {
            for (d = 0; d < dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle, ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug, "get_load_distribution finished\n");
    }
}

static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
{
    /* Return the relative performance loss on the total run time
     * due to the force calculation load imbalance.
     */
    if (dd->comm->nload > 0)
    {
        return
            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
            (dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}

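/* Worked example (added for illustration, with hypothetical numbers):
 * for accumulated load_max = 1.2, load_sum = 8.0 and load_step = 1.3
 * on nnodes = 8, the loss is (1.2*8 - 8.0)/(1.3*8) = 1.6/10.4,
 * i.e. about 15 % of the total run time is spent waiting.
 */
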
static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
{
    char               buf[STRLEN];
    int                npp, npme, nnodes, d, limp;
    float              imbal, pme_f_ratio, lossf, lossp = 0;
    gmx_bool           bLim;
    gmx_domdec_comm_t *comm;

    comm = dd->comm;
    if (DDMASTER(dd) && comm->nload > 0)
    {
        npp    = dd->nnodes;
        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
        nnodes = npp + npme;
        imbal  = comm->load_max*npp/comm->load_sum - 1;
        lossf  = dd_force_imb_perf_loss(dd);
        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "\n");
        fprintf(stderr, "%s", buf);
        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
        bLim = FALSE;
        if (comm->bDynLoadBal)
        {
            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
            for (d = 0; d < dd->ndim; d++)
            {
                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
                if (limp >= 50)
                {
                    bLim = TRUE;
                }
            }
            sprintf(buf+strlen(buf), "\n");
            fprintf(fplog, "%s", buf);
            fprintf(stderr, "%s", buf);
        }
        if (npme > 0)
        {
            pme_f_ratio = comm->load_pme/comm->load_mdf;
            lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
            if (lossp <= 0)
            {
                /* The PE-mesh load is smaller, so scale with the PME nodes */
                lossp *= (float)npme/(float)nnodes;
            }
            else
            {
                /* The PP load is smaller, so scale with the PP nodes */
                lossp *= (float)npp/(float)nnodes;
            }
            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
            fprintf(fplog, "%s", buf);
            fprintf(stderr, "%s", buf);
            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
            fprintf(fplog, "%s", buf);
            fprintf(stderr, "%s", buf);
        }
        fprintf(fplog, "\n");
        fprintf(stderr, "\n");

        if (lossf >= DD_PERF_LOSS)
        {
            sprintf(buf,
                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                    "      in the domain decomposition.\n", lossf*100);
            if (!comm->bDynLoadBal)
            {
                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
            }
            else if (bLim)
            {
                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
            }
            fprintf(fplog, "%s\n", buf);
            fprintf(stderr, "%s\n", buf);
        }
        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
        {
            sprintf(buf,
                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
                    "      had %s work to do than the PP nodes.\n"
                    "      You might want to %s the number of PME nodes\n"
                    "      or %s the cut-off and the grid spacing.\n",
                    fabs(lossp*100),
                    (lossp < 0) ? "less" : "more",
                    (lossp < 0) ? "decrease" : "increase",
                    (lossp < 0) ? "decrease" : "increase");
            fprintf(fplog, "%s\n", buf);
            fprintf(stderr, "%s\n", buf);
        }
    }
}

static float dd_vol_min(gmx_domdec_t *dd)
{
    return dd->comm->load[0].cvol_min*dd->nnodes;
}

static int dd_load_flags(gmx_domdec_t *dd)
{
    /* The flags are a bitmask with one bit per DD dimension,
     * so return an int rather than a gmx_bool.
     */
    return dd->comm->load[0].flags;
}

static float dd_f_imbal(gmx_domdec_t *dd)
{
    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
}

float dd_pme_f_ratio(gmx_domdec_t *dd)
{
    if (dd->comm->cycl_n[ddCyclPME] > 0)
    {
        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
    }
    else
    {
        return -1;
    }
}

static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
{
    int  flags, d;
    char buf[22];

    flags = dd_load_flags(dd);
    if (flags)
    {
        fprintf(fplog,
                "DD  load balancing is limited by minimum cell size in dimension");
        for (d = 0; d < dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                fprintf(fplog, " %c", dim2char(dd->dim[d]));
            }
        }
        fprintf(fplog, "\n");
    }
    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
    if (dd->comm->bDynLoadBal)
    {
        fprintf(fplog, "  vol min/aver %5.3f%c",
                dd_vol_min(dd), flags ? '!' : ' ');
    }
    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
    }
    fprintf(fplog, "\n\n");
}

static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (dd->comm->bDynLoadBal)
    {
        fprintf(stderr, "vol %4.2f%c ",
                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
    }
    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
    }
}

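/* Example of the resulting status line on stderr (illustrative values):
 *   vol 0.78! imb F 12% pme/F 0.85
 * where the '!' marks that load balancing was limited by the minimum
 * cell size in at least one dimension.
 */
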
#ifdef GMX_MPI
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
{
    MPI_Comm           c_row;
    int                dim, i, rank;
    ivec               loc_c;
    gmx_domdec_root_t *root;
    gmx_bool           bPartOfGroup = FALSE;

    dim = dd->dim[dim_ind];
    copy_ivec(loc, loc_c);
    for (i = 0; i < dd->nc[dim]; i++)
    {
        loc_c[dim] = i;
        rank       = dd_index(dd->nc, loc_c);
        if (rank == dd->rank)
        {
            /* This process is part of the group */
            bPartOfGroup = TRUE;
        }
    }
    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
                   &c_row);
    if (bPartOfGroup)
    {
        dd->comm->mpi_comm_load[dim_ind] = c_row;
        if (dd->comm->eDLB != edlbNO)
        {
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* This is the root process of this row */
                snew(dd->comm->root[dim_ind], 1);
                root = dd->comm->root[dim_ind];
                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
                snew(root->old_cell_f, dd->nc[dim]+1);
                snew(root->bCellMin, dd->nc[dim]);
                if (dim_ind > 0)
                {
                    snew(root->cell_f_max0, dd->nc[dim]);
                    snew(root->cell_f_min1, dd->nc[dim]);
                    snew(root->bound_min, dd->nc[dim]);
                    snew(root->bound_max, dd->nc[dim]);
                }
                snew(root->buf_ncd, dd->nc[dim]);
            }
            else
            {
                /* This is not a root process, we only need to receive cell_f */
                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
            }
        }
        if (dd->ci[dim] == dd->master_ci[dim])
        {
            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
        }
    }
}
#endif

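/* Illustration (hypothetical 2 x 3 x 1 grid, dim_ind selecting y):
 * for loc = (1,*,0) the loop above marks the ranks of cells (1,0,0),
 * (1,1,0) and (1,2,0) as part of the group, so MPI_Comm_split collects
 * exactly this row of three ranks into c_row; all other ranks pass
 * MPI_UNDEFINED as the color and receive MPI_COMM_NULL.
 */
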
void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
                                   const gmx_hw_info_t gmx_unused *hwinfo,
                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
{
#ifdef GMX_MPI
    int           physicalnode_id_hash;
    int           gpu_id;
    gmx_domdec_t *dd;
    MPI_Comm      mpi_comm_pp_physicalnode;

    if (!(cr->duty & DUTY_PP) ||
        hw_opt->gpu_opt.ncuda_dev_use == 0)
    {
        /* Only PP nodes (currently) use GPUs.
         * If we don't have GPUs, there are no resources to share.
         */
        return;
    }

    physicalnode_id_hash = gmx_physicalnode_id_hash();

    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);

    dd = cr->dd;

    if (debug)
    {
        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
                dd->rank, physicalnode_id_hash, gpu_id);
    }
    /* Split the PP communicator over the physical nodes */
    /* TODO: See if we should store this (before), as it's also used
     * for the nodecomm summation.
     */
    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
                   &mpi_comm_pp_physicalnode);
    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
                   &dd->comm->mpi_comm_gpu_shared);
    MPI_Comm_free(&mpi_comm_pp_physicalnode);
    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);

    if (debug)
    {
        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
    }

    /* Note that some ranks could share a GPU, while others don't */

    if (dd->comm->nrank_gpu_shared == 1)
    {
        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
    }
#endif
}

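/* Sketch of the two-stage split above (illustrative, hypothetical layout):
 * ranks 0-3 on physical node A (hash hA) and ranks 4-7 on node B (hash hB),
 * with two GPUs per node (gpu_id 0 or 1 per rank):
 *   split 1 by node hash: {0,1,2,3} and {4,5,6,7}
 *   split 2 by gpu_id:    {0,2} {1,3} {4,6} {5,7}
 * Each final communicator then holds exactly the PP ranks that share one
 * GPU, so nrank_gpu_shared == 2 in this example.
 */
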
static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
{
#ifdef GMX_MPI
    int  dim0, dim1, i, j;
    ivec loc;

    if (debug)
    {
        fprintf(debug, "Making load communicators\n");
    }

    snew(dd->comm->load, dd->ndim);
    snew(dd->comm->mpi_comm_load, dd->ndim);

    clear_ivec(loc);
    make_load_communicator(dd, 0, loc);
    if (dd->ndim > 1)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            make_load_communicator(dd, 1, loc);
        }
    }
    if (dd->ndim > 2)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            dim1      = dd->dim[1];
            for (j = 0; j < dd->nc[dim1]; j++)
            {
                loc[dim1] = j;
                make_load_communicator(dd, 2, loc);
            }
        }
    }

    if (debug)
    {
        fprintf(debug, "Finished making load communicators\n");
    }
#endif
}

void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
{
    int                     d, dim, i, j, m;
    ivec                    tmp, s;
    int                     nzone, nzonep;
    ivec                    dd_zp[DD_MAXIZONE];
    gmx_domdec_zones_t     *zones;
    gmx_domdec_ns_ranges_t *izone;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];
        copy_ivec(dd->ci, tmp);
        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
        copy_ivec(dd->ci, tmp);
        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
        if (debug)
        {
            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
                    dd->rank, dim,
                    dd->neighbor[d][0],
                    dd->neighbor[d][1]);
        }
    }

    if (fplog)
    {
        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
                dd->ndim,
                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
    switch (dd->ndim)
    {
        case 3:
            nzone  = dd_z3n;
            nzonep = dd_zp3n;
            for (i = 0; i < nzonep; i++)
            {
                copy_ivec(dd_zp3[i], dd_zp[i]);
            }
            break;
        case 2:
            nzone  = dd_z2n;
            nzonep = dd_zp2n;
            for (i = 0; i < nzonep; i++)
            {
                copy_ivec(dd_zp2[i], dd_zp[i]);
            }
            break;
        case 1:
            nzone  = dd_z1n;
            nzonep = dd_zp1n;
            for (i = 0; i < nzonep; i++)
            {
                copy_ivec(dd_zp1[i], dd_zp[i]);
            }
            break;
        default:
            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
            nzone  = 0;
            nzonep = 0;
    }

    zones = &dd->comm->zones;

    for (i = 0; i < nzone; i++)
    {
        m = 0;
        clear_ivec(zones->shift[i]);
        for (d = 0; d < dd->ndim; d++)
        {
            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
        }
    }

    zones->n = nzone;
    for (i = 0; i < nzone; i++)
    {
        for (d = 0; d < DIM; d++)
        {
            s[d] = dd->ci[d] - zones->shift[i][d];
            if (s[d] < 0)
            {
                s[d] += dd->nc[d];
            }
            else if (s[d] >= dd->nc[d])
            {
                s[d] -= dd->nc[d];
            }
        }
    }
    zones->nizone = nzonep;
    for (i = 0; i < zones->nizone; i++)
    {
        if (dd_zp[i][0] != i)
        {
            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
        }
        izone     = &zones->izone[i];
        izone->j0 = dd_zp[i][1];
        izone->j1 = dd_zp[i][2];
        for (dim = 0; dim < DIM; dim++)
        {
            if (dd->nc[dim] == 1)
            {
                /* All shifts should be allowed */
                izone->shift0[dim] = -1;
                izone->shift1[dim] = 1;
            }
            else
            {
                int shift_diff;

                /* Assume the shifts are not more than 1 cell */
                izone->shift0[dim] = 1;
                izone->shift1[dim] = -1;
                for (j = izone->j0; j < izone->j1; j++)
                {
                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
                    if (shift_diff < izone->shift0[dim])
                    {
                        izone->shift0[dim] = shift_diff;
                    }
                    if (shift_diff > izone->shift1[dim])
                    {
                        izone->shift1[dim] = shift_diff;
                    }
                }
            }
        }
    }

    if (dd->comm->eDLB != edlbNO)
    {
        snew(dd->comm->root, dd->ndim);
    }

    if (dd->comm->bRecordLoad)
    {
        make_load_communicators(dd);
    }
}

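/* Illustration: with a 3D decomposition the eight zones are shifted
 * relative to the home cell in the order given by dd_zo earlier in this
 * file, i.e. (0,0,0), (1,0,0), (1,1,0), (0,1,0), (0,1,1), (0,0,1),
 * (1,0,1), (1,1,1); for 1D or 2D decompositions only the first ndim
 * components of each shift vector are used, mapped onto the DD dimensions.
 */
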
static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int                i, rank, *buf;
#ifdef GMX_MPI
    ivec               periods;
    MPI_Comm           comm_cart;
#endif

    dd   = cr->dd;
    comm = dd->comm;

#ifdef GMX_MPI
    if (comm->bCartesianPP)
    {
        /* Set up cartesian communication for the particle-particle part */
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
        }

        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
                        &comm_cart);
        /* We overwrite the old communicator with the new cartesian one */
        cr->mpi_comm_mygroup = comm_cart;
    }

    dd->mpi_comm_all = cr->mpi_comm_mygroup;
    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);

    if (comm->bCartesianPP_PME)
    {
        /* Since we want to use the original cartesian setup for sim,
         * and not the one after split, we need to make an index.
         */
        snew(comm->ddindex2ddnodeid, dd->nnodes);
        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
        /* Get the rank of the DD master,
         * above we made sure that the master node is a PP node.
         */
        if (MASTER(cr))
        {
            rank = dd->rank;
        }
        else
        {
            rank = 0;
        }
        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
    }
    else if (comm->bCartesianPP)
    {
        if (cr->npmenodes == 0)
        {
            /* The PP communicator is also
             * the communicator for this simulation
             */
            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
        }
        cr->nodeid = dd->rank;

        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);

        /* We need to make an index to go from the coordinates
         * to the nodeid of this simulation.
         */
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);

        /* Determine the master coordinates and rank.
         * The DD master should be the same node as the master of this sim.
         */
        for (i = 0; i < dd->nnodes; i++)
        {
            if (comm->ddindex2simnodeid[i] == 0)
            {
                ddindex2xyz(dd->nc, i, dd->master_ci);
                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
            }
        }
        if (debug)
        {
            fprintf(debug, "The master rank is %d\n", dd->masterrank);
        }
    }
    else
    {
        /* No Cartesian communicators */
        /* We use the rank in dd->comm->all as DD index */
        ddindex2xyz(dd->nc, dd->rank, dd->ci);
        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
        dd->masterrank = 0;
        clear_ivec(dd->master_ci);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
    if (debug)
    {
        fprintf(debug,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
}

static void receive_ddindex2simnodeid(t_commrec *cr)
{
#ifdef GMX_MPI
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int               *buf;

    dd   = cr->dd;
    comm = dd->comm;

    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
    {
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);
    }
#endif
}

static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
                                                     int ncg, int natoms)
{
    gmx_domdec_master_t *ma;
    int                  i;

    snew(ma, 1);

    snew(ma->ncg, dd->nnodes);
    snew(ma->index, dd->nnodes+1);
    snew(ma->cg, ncg);
    snew(ma->nat, dd->nnodes);
    snew(ma->ibuf, dd->nnodes*2);
    snew(ma->cell_x, DIM);
    for (i = 0; i < DIM; i++)
    {
        snew(ma->cell_x[i], dd->nc[i]+1);
    }

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        ma->vbuf = NULL;
    }
    else
    {
        snew(ma->vbuf, natoms);
    }

    return ma;
}

static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
                               int gmx_unused reorder)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int                i, rank;
    gmx_bool           bDiv[DIM];
#ifdef GMX_MPI
    ivec               periods;
    MPI_Comm           comm_cart;
#endif

    dd   = cr->dd;
    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        for (i = 1; i < DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
                             !bDiv[YY] ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }

#ifdef GMX_MPI
    if (comm->bCartesianPP_PME)
    {
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
        }

        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                        &comm_cart);

        MPI_Comm_rank(comm_cart, &rank);
        if (MASTERNODE(cr) && rank != 0)
        {
            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }

        /* With this assignment we lose the link to the original communicator,
         * which will usually be MPI_COMM_WORLD, unless we have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid     = rank;

        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);

        if (fplog)
        {
            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }

        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       dd_index(comm->ntot, dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (dd_node_order)
        {
            case ddnoPP_PME:
                if (fplog)
                {
                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
                }
                break;
            case ddnoINTERLEAVE:
                /* Interleave the PP-only and PME-only nodes,
                 * as on clusters with dual-core machines this will double
                 * the communication bandwidth of the PME processes
                 * and thus speed up the PP <-> PME and inter PME communication.
                 */
                if (fplog)
                {
                    fprintf(fplog, "Interleaving PP and PME nodes\n");
                }
                comm->pmenodes = dd_pmenodes(cr);
                break;
            case ddnoCARTESIAN:
                break;
            default:
                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
        }

        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
    }
#endif

    if (fplog)
    {
        fprintf(fplog, "This is a %s only node\n\n",
                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}

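/* Illustration (hypothetical ranks): with 6 PP and 2 PME ranks,
 * MPI_Comm_split(mpi_comm_mysim, cr->duty, key, ...) colors the ranks by
 * duty, producing one communicator with all DUTY_PP ranks and one with
 * all DUTY_PME ranks; the key argument preserves the relative rank order
 * within each group.
 */
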
void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_bool           CartReorder;

    dd   = cr->dd;
    comm = dd->comm;

    copy_ivec(dd->nc, comm->ntot);

    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
    comm->bCartesianPP_PME = FALSE;

    /* Reorder the nodes by default. This might change the MPI ranks.
     * Real reordering is only supported on very few architectures,
     * Blue Gene is one of them.
     */
    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);

    if (cr->npmenodes > 0)
    {
        /* Split the communicator into a PP and PME part */
        split_communicator(fplog, cr, dd_node_order, CartReorder);
        if (comm->bCartesianPP_PME)
        {
            /* We (possibly) reordered the nodes in split_communicator,
             * so it is no longer required in make_pp_communicator.
             */
            CartReorder = FALSE;
        }
    }
    else
    {
        /* All nodes do PP and PME */
#ifdef GMX_MPI
        /* We do not require separate communicators */
        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
#endif
    }

    if (cr->duty & DUTY_PP)
    {
        /* Copy or make a new PP communicator */
        make_pp_communicator(fplog, cr, CartReorder);
    }
    else
    {
        receive_ddindex2simnodeid(cr);
    }

    if (!(cr->duty & DUTY_PME))
    {
        /* Set up the communication to our PME node */
        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
        dd->pme_receive_vir_ener = receive_vir_ener(cr);
        if (debug)
        {
            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
                    dd->pme_nodeid, dd->pme_receive_vir_ener);
        }
    }
    else
    {
        dd->pme_nodeid = -1;
    }

    if (DDMASTER(dd))
    {
        dd->ma = init_gmx_domdec_master_t(dd,
                                          comm->cgs_gl.nr,
                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
    }
}

static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
{
    real  *slb_frac, tot;
    int    i, n;
    double dbl;

    slb_frac = NULL;
    if (nc > 1 && size_string != NULL)
    {
        if (fplog)
        {
            fprintf(fplog, "Using static load balancing for the %s direction\n",
                    dir);
        }
        snew(slb_frac, nc);
        tot = 0;
        for (i = 0; i < nc; i++)
        {
            dbl = 0;
            sscanf(size_string, "%lf%n", &dbl, &n);
            if (dbl == 0)
            {
                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
            }
            slb_frac[i]  = dbl;
            size_string += n;
            tot         += slb_frac[i];
        }
        /* Normalize */
        if (fplog)
        {
            fprintf(fplog, "Relative cell sizes:");
        }
        for (i = 0; i < nc; i++)
        {
            slb_frac[i] /= tot;
            if (fplog)
            {
                fprintf(fplog, " %5.3f", slb_frac[i]);
            }
        }
        if (fplog)
        {
            fprintf(fplog, "\n");
        }
    }

    return slb_frac;
}

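/* Worked example (illustrative): for nc = 3 and size_string = "1 2 1"
 * the loop reads 1, 2 and 1 (tot = 4), and after normalization the
 * relative cell sizes along this direction are 0.25, 0.50 and 0.25.
 */
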
static int multi_body_bondeds_count(gmx_mtop_t *mtop)
{
    int                  n, nmol, ftype;
    gmx_mtop_ilistloop_t iloop;
    t_ilist             *il;

    n     = 0;
    iloop = gmx_mtop_ilistloop_init(mtop);
    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
    {
        for (ftype = 0; ftype < F_NRE; ftype++)
        {
            if ((interaction_function[ftype].flags & IF_BOND) &&
                NRAL(ftype) > 2)
            {
                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
            }
        }
    }

    return n;
}

static int dd_nst_env(FILE *fplog, const char *env_var, int def)
{
    char *val;
    int   nst;

    nst = def;
    val = getenv(env_var);
    if (val)
    {
        if (sscanf(val, "%d", &nst) <= 0)
        {
            nst = 1;
        }
        if (fplog)
        {
            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                    env_var, val, nst);
        }
    }

    return nst;
}

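/* Usage example (illustrative): running with the environment variable
 *   GMX_DD_DUMP=100
 * makes dd_nst_env(fplog, "GMX_DD_DUMP", 0) return 100, so DD dump output
 * is produced every 100 steps instead of never (the default of 0).
 */
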
static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
{
    if (MASTER(cr))
    {
        fprintf(stderr, "\n%s\n", warn_string);
    }
    if (fplog)
    {
        fprintf(fplog, "\n%s\n", warn_string);
    }
}

static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
                                  t_inputrec *ir, FILE *fplog)
{
    if (ir->ePBC == epbcSCREW &&
        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
    {
        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
    }

    if (ir->ns_type == ensSIMPLE)
    {
        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
    }

    if (ir->nstlist == 0)
    {
        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
    }

    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
    {
        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
    }
}

static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int  di, d;
    real r;

    r = ddbox->box_size[XX];
    for (di = 0; di < dd->ndim; di++)
    {
        d = dd->dim[di];
        /* Check using the initial average cell size */
        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
    }

    return r;
}

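/* Worked example (illustrative): for a rectangular 9 x 6 x 6 nm box
 * (all skew_fac = 1) decomposed as nc = {3, 2, 2}, the average cell
 * sizes per dimension are 3, 3 and 3 nm, so the function returns 3 nm.
 */
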
static int check_dlb_support(FILE *fplog, t_commrec *cr,
                             const char *dlb_opt, gmx_bool bRecordLoad,
                             unsigned long Flags, t_inputrec *ir)
{
    int  eDLB = -1;
    char buf[STRLEN];

    switch (dlb_opt[0])
    {
        case 'a': eDLB = edlbAUTO; break;
        case 'n': eDLB = edlbNO;   break;
        case 'y': eDLB = edlbYES;  break;
        default: gmx_incons("Unknown dlb_opt");
    }

    if (Flags & MD_RERUN)
    {
        return edlbNO;
    }

    if (!EI_DYNAMICS(ir->eI))
    {
        if (eDLB == edlbYES)
        {
            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
            dd_warning(cr, fplog, buf);
        }

        return edlbNO;
    }

    if (!bRecordLoad)
    {
        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");

        return edlbNO;
    }

    if (Flags & MD_REPRODUCIBLE)
    {
        switch (eDLB)
        {
            case edlbNO:
                break;
            case edlbAUTO:
                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
                eDLB = edlbNO;
                break;
            case edlbYES:
                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                break;
            default:
                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
                break;
        }
    }

    return eDLB;
}

static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
{
    int dim;

    dd->ndim = 0;
    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
    {
        /* Decomposition order z,y,x */
        if (fplog)
        {
            fprintf(fplog, "Using domain decomposition order z, y, x\n");
        }
        for (dim = DIM-1; dim >= 0; dim--)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
    else
    {
        /* Decomposition order x,y,z */
        for (dim = 0; dim < DIM; dim++)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
}

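/* Example (illustrative): for nc = {4, 1, 2} the default x,y,z order
 * yields ndim = 2 with dim[0] = XX and dim[1] = ZZ, while setting
 * GMX_DD_ORDER_ZYX reverses this to dim[0] = ZZ and dim[1] = XX.
 */
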
static gmx_domdec_comm_t *init_dd_comm()
{
    gmx_domdec_comm_t *comm;
    int                i;

    snew(comm, 1);

    snew(comm->cggl_flag, DIM*2);
    snew(comm->cgcm_state, DIM*2);
    for (i = 0; i < DIM*2; i++)
    {
        comm->cggl_flag_nalloc[i]  = 0;
        comm->cgcm_state_nalloc[i] = 0;
    }

    comm->nalloc_int = 0;
    comm->buf_int    = NULL;

    vec_rvec_init(&comm->vbuf);

    comm->n_load_have    = 0;
    comm->n_load_collect = 0;

    for (i = 0; i < ddnatNR-ddnatZONE; i++)
    {
        comm->sum_nat[i] = 0;
    }
    comm->nload     = 0;
    comm->load_step = 0;
    comm->load_sum  = 0;
    comm->load_max  = 0;
    clear_ivec(comm->load_lim);
    comm->load_mdf = 0;
    comm->load_pme = 0;

    return comm;
}

gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
                                        unsigned long Flags,
                                        ivec nc,
                                        real comm_distance_min, real rconstr,
                                        const char *dlb_opt, real dlb_scale,
                                        const char *sizex, const char *sizey, const char *sizez,
                                        gmx_mtop_t *mtop, t_inputrec *ir,
                                        matrix box, rvec *x,
                                        gmx_ddbox_t *ddbox,
                                        int *npme_x, int *npme_y)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int                recload;
    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
    gmx_bool           bC;
    char               buf[STRLEN];

    if (fplog)
    {
        fprintf(fplog,
                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
    }

    snew(dd, 1);

    dd->comm = init_dd_comm();
    comm     = dd->comm;

    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
    dd->bScrewPBC = (ir->ePBC == epbcSCREW);

    dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
    comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
    comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
    recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
    comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
    comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
    comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
    comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);

    dd->pme_recv_f_alloc = 0;
    dd->pme_recv_f_buf   = NULL;

    if (dd->bSendRecv2 && fplog)
    {
        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
    }
    if (comm->eFlop)
    {
        if (fplog)
        {
            fprintf(fplog, "Will load balance based on FLOP count\n");
        }
        if (comm->eFlop > 1)
        {
            srand(1+cr->nodeid);
        }
        comm->bRecordLoad = TRUE;
    }
    else
    {
        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
    }

    /* Initialize the GPU share count to 0, might change later */
    comm->nrank_gpu_shared = 0;

    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);

    comm->bDynLoadBal = (comm->eDLB == edlbYES);
    if (fplog)
    {
        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
    }
    dd->bGridJump              = comm->bDynLoadBal;
    comm->bPMELoadBalDLBLimits = FALSE;

    if (comm->nstSortCG)
    {
        if (fplog)
        {
            if (comm->nstSortCG == 1)
            {
                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
            }
            else
            {
                fprintf(fplog, "Will sort the charge groups every %d steps\n",
                        comm->nstSortCG);
            }
        }
        snew(comm->sort, 1);
    }
    else
    {
        if (fplog)
        {
            fprintf(fplog, "Will not sort the charge groups\n");
        }
    }

    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);

    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
    if (comm->bInterCGBondeds)
    {
        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
    }
    else
    {
        comm->bInterCGMultiBody = FALSE;
    }

    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
    dd->bInterCGsettles = inter_charge_group_settles(mtop);

    if (ir->rlistlong == 0)
    {
        /* Set the cut-off to some very large value,
         * so we don't need if statements everywhere in the code.
         * We use sqrt, since the cut-off is squared in some places.
         */
        comm->cutoff = GMX_CUTOFF_INF;
    }
    else
    {
        comm->cutoff = ir->rlistlong;
    }
    comm->cutoff_mbody = 0;

    comm->cellsize_limit = 0;
    comm->bBondComm      = FALSE;

    if (comm->bInterCGBondeds)
    {
        if (comm_distance_min > 0)
        {
            comm->cutoff_mbody = comm_distance_min;
            if (Flags & MD_DDBONDCOMM)
            {
                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
            }
            else
            {
                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
            }
            r_bonded_limit = comm->cutoff_mbody;
        }
        else if (ir->bPeriodicMols)
        {
            /* Can not easily determine the required cut-off */
            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
            comm->cutoff_mbody = comm->cutoff/2;
            r_bonded_limit     = comm->cutoff_mbody;
        }
        else
        {
            if (MASTER(cr))
            {
                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
            }
            gmx_bcast(sizeof(r_2b), &r_2b, cr);
            gmx_bcast(sizeof(r_mb), &r_mb, cr);

            /* We use an initial margin of 10% for the minimum cell size,
             * except when we are just below the non-bonded cut-off.
             */
            if (Flags & MD_DDBONDCOMM)
            {
                if (max(r_2b, r_mb) > comm->cutoff)
                {
                    r_bonded        = max(r_2b, r_mb);
                    r_bonded_limit  = 1.1*r_bonded;
                    comm->bBondComm = TRUE;
                }
                else
                {
                    r_bonded       = r_mb;
                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
                }
                /* We determine cutoff_mbody later */
            }
            else
            {
                /* No special bonded communication,
                 * simply increase the DD cut-off.
                 */
                r_bonded_limit     = 1.1*max(r_2b, r_mb);
                comm->cutoff_mbody = r_bonded_limit;
                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
            }
        }
        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
        if (fplog)
        {
            fprintf(fplog,
                    "Minimum cell size due to bonded interactions: %.3f nm\n",
                    comm->cellsize_limit);
        }
    }

    if (dd->bInterCGcons && rconstr <= 0)
    {
        /* There is a cell size limit due to the constraints (P-LINCS) */
        rconstr = constr_r_max(fplog, mtop, ir);
        if (fplog)
        {
            fprintf(fplog,
                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
                    rconstr);
            if (rconstr > comm->cellsize_limit)
            {
                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
            }
        }
    }
    else if (rconstr > 0 && fplog)
    {
        /* Here we do not check for dd->bInterCGcons,
         * because one can also set a cell size limit for virtual sites only
         * and at this point we don't know yet if there are intercg v-sites.
         */
        fprintf(fplog,
                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
                rconstr);
    }
    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);

    comm->cgs_gl = gmx_mtop_global_cgs(mtop);

    if (nc[XX] > 0)
    {
        copy_ivec(nc, dd->nc);
        set_dd_dim(fplog, dd);
        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);

        if (cr->npmenodes == -1)
        {
            cr->npmenodes = 0;
        }
        acs = average_cellsize_min(dd, ddbox);
        if (acs < comm->cellsize_limit)
        {
            if (fplog)
            {
                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
            }
            gmx_fatal_collective(FARGS, cr, NULL,
                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
                                 acs, comm->cellsize_limit);
        }
    }
    else
    {
        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);

        /* We need to choose the optimal DD grid and possibly PME nodes */
        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
                               comm->eDLB != edlbNO, dlb_scale,
                               comm->cellsize_limit, comm->cutoff,
                               comm->bInterCGBondeds);

        if (dd->nc[XX] == 0)
        {
            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
                    !bC ? "-rdd" : "-rcon",
                    comm->eDLB != edlbNO ? " or -dds" : "",
                    bC ? " or your LINCS settings" : "");

            gmx_fatal_collective(FARGS, cr, NULL,
                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
                                 "%s\n"
                                 "Look in the log file for details on the domain decomposition",
                                 cr->nnodes-cr->npmenodes, limit, buf);
        }
        set_dd_dim(fplog, dd);
    }

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
    }

    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
    if (cr->nnodes - dd->nnodes != cr->npmenodes)
    {
        gmx_fatal_collective(FARGS, cr, NULL,
                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
    }
    if (cr->npmenodes > dd->nnodes)
    {
        gmx_fatal_collective(FARGS, cr, NULL,
                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
    }
    if (cr->npmenodes > 0)
    {
        comm->npmenodes = cr->npmenodes;
    }
    else
    {
        comm->npmenodes = dd->nnodes;
    }

    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
    {
        /* The following choices should match those
         * in comm_cost_est in domdec_setup.c.
         * Note that here the checks have to take into account
         * that the decomposition might occur in a different order than xyz
         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
         * in which case they will not match those in comm_cost_est,
         * but since that is mainly for testing purposes that's fine.
         */
        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
            getenv("GMX_PMEONEDD") == NULL)
        {
            comm->npmedecompdim = 2;
            comm->npmenodes_x   = dd->nc[XX];
            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
        }
        else
        {
            /* In case nc is 1 in both x and y we could still choose to
             * decompose pme in y instead of x, but we use x for simplicity.
             */
            comm->npmedecompdim = 1;
            if (dd->dim[0] == YY)
            {
                comm->npmenodes_x = 1;
                comm->npmenodes_y = comm->npmenodes;
            }
            else
            {
                comm->npmenodes_x = comm->npmenodes;
                comm->npmenodes_y = 1;
            }
        }
        if (fplog)
        {
            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
                    comm->npmenodes_x, comm->npmenodes_y, 1);
        }
    }
    else
    {
        comm->npmedecompdim = 0;
        comm->npmenodes_x   = 0;
        comm->npmenodes_y   = 0;
    }

    /* Technically we don't need both of these,
     * but it simplifies code not having to recalculate it.
     */
    *npme_x = comm->npmenodes_x;
    *npme_y = comm->npmenodes_y;

    snew(comm->slb_frac, DIM);
    if (comm->eDLB == edlbNO)
    {
        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
    }

    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
    {
        if (comm->bBondComm || comm->eDLB != edlbNO)
        {
            /* Set the bonded communication distance to halfway
             * the minimum and the maximum,
             * since the extra communication cost is nearly zero.
             */
            acs                = average_cellsize_min(dd, ddbox);
            comm->cutoff_mbody = 0.5*(r_bonded + acs);
            if (comm->eDLB != edlbNO)
            {
                /* Check if this does not limit the scaling */
                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
            }
            if (!comm->bBondComm)
            {
                /* Without bBondComm do not go beyond the n.b. cut-off */
                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
                if (comm->cellsize_limit >= comm->cutoff)
                {
                    /* We don't lose a lot of efficiency
                     * when increasing it to the n.b. cut-off.
                     * It can even be slightly faster, because we need
                     * less checks for the communication setup.
                     */
                    comm->cutoff_mbody = comm->cutoff;
                }
            }
            /* Check if we did not end up below our original limit */
            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);

            if (comm->cutoff_mbody > comm->cellsize_limit)
            {
                comm->cellsize_limit = comm->cutoff_mbody;
            }
        }
        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
    }

    if (debug)
    {
        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
                "cellsize limit %f\n",
                comm->bBondComm, comm->cellsize_limit);
    }

    if (MASTER(cr))
    {
        check_dd_restrictions(cr, dd, ir, fplog);
    }

    comm->partition_step = INT_MIN;

    clear_dd_cycle_counts(dd);

    return dd;
}

static void set_dlb_limits(gmx_domdec_t *dd)
{
    int d;

    for (d = 0; d < dd->ndim; d++)
    {
        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
        dd->comm->cellsize_min[dd->dim[d]] =
            dd->comm->cellsize_min_dlb[dd->dim[d]];
    }
}

static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    real               cellsize_min;
    int                d, nc, i;
    char               buf[22];

    dd   = cr->dd;
    comm = dd->comm;

    if (fplog)
    {
        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
    }

    cellsize_min = comm->cellsize_min[dd->dim[0]];
    for (d = 1; d < dd->ndim; d++)
    {
        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
    }

    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");

        /* Change DLB from "auto" to "no". */
        comm->eDLB = edlbNO;

        return;
    }

    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
    comm->bDynLoadBal = TRUE;
    dd->bGridJump     = TRUE;

    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
    for (d = 0; d < dd->ndim; d++)
    {
        if (comm->root[d])
        {
            comm->load[d].sum_m = comm->load[d].sum;

            nc = dd->nc[dd->dim[d]];
            for (i = 0; i < nc; i++)
            {
                comm->root[d]->cell_f[i] = i/(real)nc;
                if (d > 0)
                {
                    comm->root[d]->cell_f_max0[i] = i/(real)nc;
                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
                }
            }
            comm->root[d]->cell_f[nc] = 1.0;
        }
    }
}

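/* Example (illustrative): for nc = 4 cells along a dimension the uniform
 * relative boundaries set above are
 *   cell_f      = {0.00, 0.25, 0.50, 0.75, 1.00}
 * and, for the non-master dimensions (d > 0),
 *   cell_f_max0 = {0.00, 0.25, 0.50, 0.75}
 *   cell_f_min1 = {0.25, 0.50, 0.75, 1.00}
 */
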
static char *init_bLocalCG(gmx_mtop_t *mtop)
{
    int   ncg, cg;
    char *bLocalCG;

    ncg = ncg_mtop(mtop);
    snew(bLocalCG, ncg);
    for (cg = 0; cg < ncg; cg++)
    {
        bLocalCG[cg] = FALSE;
    }

    return bLocalCG;
}

void dd_init_bondeds(FILE *fplog,
                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
                     gmx_vsite_t *vsite,
                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
{
    gmx_domdec_comm_t *comm;

    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);

    comm = dd->comm;

    if (comm->bBondComm)
    {
        /* Communicate atoms beyond the cut-off for bonded interactions */
        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);

        comm->bLocalCG = init_bLocalCG(mtop);
    }
    else
    {
        /* Only communicate atoms based on cut-off */
        comm->cglink   = NULL;
        comm->bLocalCG = NULL;
    }
}

static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
                              t_inputrec *ir,
                              gmx_bool bDynLoadBal, real dlb_scale,
                              gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int                d;
    ivec               np;
    real               limit, shrink;
    char               buf[64];

    if (fplog == NULL)
    {
        return;
    }

    comm = dd->comm;

    if (bDynLoadBal)
    {
        fprintf(fplog, "The maximum number of communication pulses is:");
        for (d = 0; d < dd->ndim; d++)
        {
            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
        }
        fprintf(fplog, "\n");
        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
        for (d = 0; d < DIM; d++)
        {
            if (dd->nc[d] > 1)
            {
                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
                {
                    shrink = 0;
                }
                else
                {
                    shrink =
                        comm->cellsize_min_dlb[d]/
                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
                }
                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
            }
        }
        fprintf(fplog, "\n");
    }
    else
    {
        set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
        fprintf(fplog, "The initial number of communication pulses is:");
        for (d = 0; d < dd->ndim; d++)
        {
            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
        }
        fprintf(fplog, "\n");
        fprintf(fplog, "The initial domain decomposition cell size is:");
        for (d = 0; d < DIM; d++)
        {
            if (dd->nc[d] > 1)
            {
                fprintf(fplog, " %c %.2f nm",
                        dim2char(d), dd->comm->cellsize_min[d]);
            }
        }
        fprintf(fplog, "\n\n");
    }

    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
    {
        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
        fprintf(fplog, "%40s %-7s %6.3f nm\n",
                "non-bonded interactions", "", comm->cutoff);

        if (bDynLoadBal)
        {
            limit = dd->comm->cellsize_limit;
        }
        else
        {
            if (dynamic_dd_box(ddbox, ir))
            {
                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
            }
            limit = dd->comm->cellsize_min[XX];
            for (d = 1; d < DIM; d++)
            {
                limit = min(limit, dd->comm->cellsize_min[d]);
            }
        }

        if (comm->bInterCGBondeds)
        {
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "two-body bonded interactions", "(-rdd)",
                    max(comm->cutoff, comm->cutoff_mbody));
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "multi-body bonded interactions", "(-rdd)",
                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
        }
        if (dd->vsite_comm)
        {
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "virtual site constructions", "(-rcon)", limit);
        }
        if (dd->constraint_comm)
        {
            sprintf(buf, "atoms separated by up to %d constraints",
                    1+ir->nProjOrder);
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    buf, "(-rcon)", limit);
        }
        fprintf(fplog, "\n");
    }

    fflush(fplog);
}

static void set_cell_limits_dlb(gmx_domdec_t      *dd,
                                real               dlb_scale,
                                const t_inputrec  *ir,
                                const gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int                d, dim, npulse, npulse_d_max, npulse_d;
    gmx_bool           bNoCutOff;

    comm = dd->comm;

    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);

    /* Determine the maximum number of comm. pulses in one dimension */

    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);

    /* Determine the maximum required number of grid pulses */
    if (comm->cellsize_limit >= comm->cutoff)
    {
        /* Only a single pulse is required */
        npulse = 1;
    }
    else if (!bNoCutOff && comm->cellsize_limit > 0)
    {
        /* We round down slightly here to avoid overhead due to the latency
         * of extra communication calls when the cut-off
         * would be only slightly longer than the cell size.
         * Later cellsize_limit is redetermined,
         * so we can not miss interactions due to this rounding.
         */
        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
    }
    else
    {
        /* There is no cell size limit */
        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
    }

    if (!bNoCutOff && npulse > 1)
    {
        /* See if we can do with less pulses, based on dlb_scale */
        npulse_d_max = 0;
        for (d = 0; d < dd->ndim; d++)
        {
            dim          = dd->dim[d];
            npulse_d     = (int)(1 + dd->nc[dim]*comm->cutoff
                                 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
            npulse_d_max = max(npulse_d_max, npulse_d);
        }
        npulse = min(npulse, npulse_d_max);
    }

    /* This env var can override npulse */
    d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
    if (d > 0)
    {
        npulse = d;
    }

    comm->maxpulse       = 1;
    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
    for (d = 0; d < dd->ndim; d++)
    {
        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
        {
            comm->bVacDLBNoLimit = FALSE;
        }
    }

    /* cellsize_limit is set for LINCS in init_domain_decomposition */
    if (!comm->bVacDLBNoLimit)
    {
        comm->cellsize_limit = max(comm->cellsize_limit,
                                   comm->cutoff/comm->maxpulse);
    }
    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
    /* Set the minimum cell size for each DD dimension */
    for (d = 0; d < dd->ndim; d++)
    {
        if (comm->bVacDLBNoLimit ||
            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
        {
            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
        }
        else
        {
            comm->cellsize_min_dlb[dd->dim[d]] =
                comm->cutoff/comm->cd[d].np_dlb;
        }
    }
    if (comm->cutoff_mbody <= 0)
    {
        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
    }
    if (comm->bDynLoadBal)
    {
        set_dlb_limits(dd);
    }
}

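/* Worked example (illustrative): with cutoff = 1.2 nm and
 * cellsize_limit = 0.5 nm the pulse estimate above gives
 *   npulse = (int)(0.96 + 1.2/0.5) = (int)3.36 = 3
 * so up to three communication pulses are allowed per dimension.
 */
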
gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
{
    /* If each molecule is a single charge group
     * or we use domain decomposition for each periodic dimension,
     * we do not need to take pbc into account for the bonded interactions.
     */
    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
            !(dd->nc[XX] > 1 &&
              dd->nc[YY] > 1 &&
              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
}

void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
                       t_inputrec *ir, gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int                natoms_tot;
    real               vol_frac;

    comm = dd->comm;

    /* Initialize the thread data.
     * This can not be done in init_domain_decomposition,
     * as the number of threads is determined later.
     */
    comm->nth = gmx_omp_nthreads_get(emntDomdec);
    if (comm->nth > 1)
    {
        snew(comm->dth, comm->nth);
    }

    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
    {
        init_ddpme(dd, &comm->ddpme[0], 0);
        if (comm->npmedecompdim >= 2)
        {
            init_ddpme(dd, &comm->ddpme[1], 1);
        }
    }
    else
    {
        comm->npmenodes = 0;
        if (dd->pme_nodeid >= 0)
        {
            gmx_fatal_collective(FARGS, NULL, dd,
                                 "Can not have separate PME nodes without PME electrostatics");
        }
    }

    if (debug)
    {
        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
    }
    if (comm->eDLB != edlbNO)
    {
        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
    }

    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
    if (comm->eDLB == edlbAUTO)
    {
        if (fplog)
        {
            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
        }
        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
    }

    if (ir->ePBC == epbcNONE)
    {
        vol_frac = 1 - 1/(double)dd->nnodes;
    }
    else
    {
        vol_frac =
            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
    }
    if (debug)
    {
        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
    }
    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];

    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
}

static gmx_bool test_dd_cutoff(t_commrec *cr,
                               t_state *state, t_inputrec *ir,
                               real cutoff_req)
{
    gmx_domdec_t *dd;
    gmx_ddbox_t   ddbox;
    int           d, dim, np;
    real          inv_cell_size;
    int           LocallyLimited;

    dd = cr->dd;

    set_ddbox(dd, FALSE, cr, ir, state->box,
              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);

    LocallyLimited = 0;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];

        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
        if (dynamic_dd_box(&ddbox, ir))
        {
            inv_cell_size *= DD_PRES_SCALE_MARGIN;
        }

        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);

        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
            dd->comm->cd[d].np_dlb > 0)
        {
            if (np > dd->comm->cd[d].np_dlb)
            {
                return FALSE;
            }

            /* If a current local cell size is smaller than the requested
             * cut-off, we could still fix it, but this gets very complicated.
             * Without fixing here, we might actually need more checks.
             */
            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
            {
                LocallyLimited = 1;
            }
        }
    }

    if (dd->comm->eDLB != edlbNO)
    {
        /* If DLB is not active yet, we don't need to check the grid jumps.
         * Actually we shouldn't, because then the grid jump data is not set.
         */
        if (dd->comm->bDynLoadBal &&
            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
        {
            LocallyLimited = 1;
        }

        gmx_sumi(1, &LocallyLimited, cr);

        if (LocallyLimited > 0)
        {
            return FALSE;
        }
    }

    return TRUE;
}

gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
                          real cutoff_req)
{
    gmx_bool bCutoffAllowed;

    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);

    if (bCutoffAllowed)
    {
        cr->dd->comm->cutoff = cutoff_req;
    }

    return bCutoffAllowed;
}

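/* Usage sketch (illustrative, hypothetical caller): a load balancer can
 * probe a longer cut-off and only commit it when every rank agrees, e.g.
 *   if (change_dd_cutoff(cr, state, ir, 1.15))
 *   {
 *       /.. 1.15 nm is now the DD communication cut-off ../
 *   }
 * The new value is only stored when test_dd_cutoff() succeeds globally.
 */
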
void change_dd_dlb_cutoff_limit(t_commrec *cr)
{
    gmx_domdec_comm_t *comm;

    comm = cr->dd->comm;

    /* Turn on the DLB limiting (might have been on already) */
    comm->bPMELoadBalDLBLimits = TRUE;

    /* Change the cut-off limit */
    comm->PMELoadBal_max_cutoff = comm->cutoff;
}

static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int *ncg_cell,
                             int *index_gl, int *recv_i,
                             rvec *cg_cm, rvec *recv_vr,
                             int *cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int               shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    shift = ind->nrecv[ncell];
    for (cell = ncell-1; cell >= 0; cell--)
    {
        shift -= ind->nrecv[cell];
        if (shift > 0)
        {
            /* Move the cg's present from previous grid pulses */
            cg0                = ncg_cell[ncell+cell];
            cg1                = ncg_cell[ncell+cell+1];
            cgindex[cg1+shift] = cgindex[cg1];
            for (cg = cg1-1; cg >= cg0; cg--)
            {
                index_gl[cg+shift] = index_gl[cg];
                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                cgindex[cg+shift] = cgindex[cg];
                cginfo[cg+shift]  = cginfo[cg];
            }
            /* Correct the already stored send indices for the shift */
            for (p = 1; p <= pulse; p++)
            {
                ind_p = &cd->ind[p];
                cg0   = 0;
                for (c = 0; c < cell; c++)
                {
                    cg0 += ind_p->nsend[c];
                }
                cg1 = cg0 + ind_p->nsend[cell];
                for (cg = cg0; cg < cg1; cg++)
                {
                    ind_p->index[cg] += shift;
                }
            }
        }
    }

    /* Merge in the communicated buffers */
    shift    = 0;
    shift_at = 0;
    cg0      = 0;
    for (cell = 0; cell < ncell; cell++)
    {
        cg1 = ncg_cell[ncell+cell+1] + shift;
        if (shift_at > 0)
        {
            /* Correct the old cg indices */
            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
            {
                cgindex[cg+1] += shift_at;
            }
        }
        for (cg = 0; cg < ind->nrecv[cell]; cg++)
        {
            /* Copy this charge group from the buffer */
            index_gl[cg1] = recv_i[cg0];
            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
            /* Add it to the cgindex */
            cg_gl          = index_gl[cg1];
            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
            cgindex[cg1+1] = cgindex[cg1] + nat;
            cg0++;
            cg1++;
            shift_at += nat;
        }
        shift                  += ind->nrecv[cell];
        ncg_cell[ncell+cell+1]  = cg1;
    }
}

static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
                               int nzone, int cg0, const int *cgindex)
{
    int cg, zone, p;

    /* Store the atom block boundaries for easy copying of communication buffers
     */
    cg = cg0;
    for (zone = 0; zone < nzone; zone++)
    {
        for (p = 0; p < cd->np; p++)
        {
            cd->ind[p].cell2at0[zone] = cgindex[cg];
            cg += cd->ind[p].nrecv[zone];
            cd->ind[p].cell2at1[zone] = cgindex[cg];
        }
    }
}

static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
{
    int      i;
    gmx_bool bMiss;

    bMiss = FALSE;
    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
    {
        if (!bLocalCG[link->a[i]])
        {
            bMiss = TRUE;
        }
    }

    return bMiss;
}

/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication */
    real cr0;       /* corner for rounding */
    real cr1[4];    /* corners for rounding */
    real bc[DIM];   /* corners for bounded communication */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;

/* Determine the corners of the domain(s) we are communicating with */
static void
set_dd_corners(const gmx_domdec_t *dd,
               int dim0, int dim1, int dim2,
               gmx_bool bDistMB,
               dd_corners_t *c)
{
    const gmx_domdec_comm_t  *comm;
    const gmx_domdec_zones_t *zones;
    int                       i, j;

    comm = dd->comm;

    zones = &comm->zones;

    /* Keep the compiler happy */
    c->cr0  = 0;
    c->bcr1 = 0;

    /* The first dimension is equal for all cells */
    c->c[0][0] = comm->cell_x0[dim0];
    if (bDistMB)
    {
        c->bc[0] = c->c[0][0];
    }
    if (dd->ndim >= 2)
    {
        dim1 = dd->dim[1];
        /* This cell row is only seen from the first row */
        c->c[1][0] = comm->cell_x0[dim1];
        /* All rows can see this row */
        c->c[1][1] = comm->cell_x0[dim1];
        if (dd->bGridJump)
        {
            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
            if (bDistMB)
            {
                /* For the multi-body distance we need the maximum */
                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
            }
        }
        /* Set the upper-right corner for rounding */
        c->cr0 = comm->cell_x1[dim0];

        if (dd->ndim >= 3)
        {
            dim2 = dd->dim[2];
            for (j = 0; j < 4; j++)
            {
                c->c[2][j] = comm->cell_x0[dim2];
            }
            if (dd->bGridJump)
            {
                /* Use the maximum of the i-cells that see a j-cell */
                for (i = 0; i < zones->nizone; i++)
                {
                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
                    {
                        if (j >= 4)
                        {
                            c->c[2][j-4] =
                                max(c->c[2][j-4],
                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                        }
                    }
                }
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bc[2] = comm->cell_x0[dim2];
                    for (i = 0; i < 2; i++)
                    {
                        for (j = 0; j < 2; j++)
                        {
                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
                        }
                    }
                }
            }

            /* Set the upper-right corner for rounding */
            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
             * Only cell (0,0,0) can see cell 7 (1,1,1)
             */
            c->cr1[0] = comm->cell_x1[dim1];
            c->cr1[3] = comm->cell_x1[dim1];
            if (dd->bGridJump)
            {
                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                }
            }
        }
    }
}

7827 /* Determine which cg's we need to send in this pulse from this zone */
7829 get_zone_pulse_cgs(gmx_domdec_t
*dd
,
7830 int zonei
, int zone
,
7832 const int *index_gl
,
7834 int dim
, int dim_ind
,
7835 int dim0
, int dim1
, int dim2
,
7836 real r_comm2
, real r_bcomm2
,
7840 real skew_fac2_d
, real skew_fac_01
,
7841 rvec
*v_d
, rvec
*v_0
, rvec
*v_1
,
7842 const dd_corners_t
*c
,
7844 gmx_bool bDistBonded
,
7850 gmx_domdec_ind_t
*ind
,
7851 int **ibuf
, int *ibuf_nalloc
,
7857 gmx_domdec_comm_t
*comm
;
7859 gmx_bool bDistMB_pulse
;
    real               r2, rb2, r, tric_sh;
    rvec               rn, rb;
    int                dimd;
    int                nsend_z, nsend, nat;

    comm = dd->comm;

    bScrew = (dd->bScrewPBC && dim == XX);

    bDistMB_pulse = (bDistMB && bDistBonded);

    nsend_z = 0;
    nsend   = *nsend_ptr;
    nat     = *nat_ptr;

    for (cg = cg0; cg < cg1; cg++)
    {
        r2  = 0;
        rb2 = 0;
        if (tric_dist[dim_ind] == 0)
        {
            /* Rectangular direction, easy */
            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
            if (r > 0)
            {
                r2 += r*r;
            }
            if (bDistMB_pulse)
            {
                r = cg_cm[cg][dim] - c->bc[dim_ind];
                if (r > 0)
                {
                    rb2 += r*r;
                }
            }
            /* Rounding gives at most a 16% reduction
             * in communicated atoms.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                r = cg_cm[cg][dim0] - c->cr0;
                /* This is the first dimension, so always r >= 0 */
                r2 += r*r;
                if (bDistMB_pulse)
                {
                    rb2 += r*r;
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                r = cg_cm[cg][dim1] - c->cr1[zone];
                if (r > 0)
                {
                    r2 += r*r;
                }
                if (bDistMB_pulse)
                {
                    r = cg_cm[cg][dim1] - c->bcr1;
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
            }
        }
        else
        {
            /* Triclinic direction, more complicated */
            clear_rvec(rn);
            clear_rvec(rb);
            /* Rounding, conservative as the skew_fac multiplication
             * will slightly underestimate the distance.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                for (i = dim0+1; i < DIM; i++)
                {
                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                }
                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                if (bDistMB_pulse)
                {
                    rb[dim0] = rn[dim0];
                    rb2      = r2;
                }
                /* Take care that the cell planes along dim0 might not
                 * be orthogonal to those along dim1 and dim2.
                 */
                for (i = 1; i <= dim_ind; i++)
                {
                    dimd = dd->dim[i];
                    if (normal[dim0][dimd] > 0)
                    {
                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                        if (bDistMB_pulse)
                        {
                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                        }
                    }
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                tric_sh   = 0;
                for (i = dim1+1; i < DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                }
                rn[dim1] += tric_sh;
                if (rn[dim1] > 0)
                {
                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                    /* Take care that the cell planes along dim1
                     * might not be orthogonal to that along dim2.
                     */
                    if (normal[dim1][dim2] > 0)
                    {
                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim1] +=
                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                    if (rb[dim1] > 0)
                    {
                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                        }
                    }
                }
            }
            /* The distance along the communication direction */
            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
            tric_sh  = 0;
            for (i = dim+1; i < DIM; i++)
            {
                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
            }
            rn[dim] += tric_sh;
            if (rn[dim] > 0)
            {
                r2 += rn[dim]*rn[dim]*skew_fac2_d;
                /* Take care of coupling of the distances
                 * to the planes along dim0 and dim1 through dim2.
                 */
                if (dim_ind == 1 && zonei == 1)
                {
                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                }
            }
            if (bDistMB_pulse)
            {
                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                if (rb[dim] > 0)
                {
                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                    }
                }
            }
        }

        if (r2 < r_comm2 ||
            (bDistBonded &&
             ((bDistMB && rb2 < r_bcomm2) ||
              (bDist2B && r2  < r_bcomm2)) &&
             (!bBondComm ||
              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
               missing_link(comm->cglink, index_gl[cg],
                            comm->bLocalCG)))))
        {
            /* Make an index to the local charge groups */
            if (nsend+1 > ind->nalloc)
            {
                ind->nalloc = over_alloc_large(nsend+1);
                srenew(ind->index, ind->nalloc);
            }
            if (nsend+1 > *ibuf_nalloc)
            {
                *ibuf_nalloc = over_alloc_large(nsend+1);
                srenew(*ibuf, *ibuf_nalloc);
            }
            ind->index[nsend] = cg;
            (*ibuf)[nsend]    = index_gl[cg];
            nsend_z++;
            vec_rvec_check_alloc(vbuf, nsend+1);

            if (dd->ci[dim] == 0)
            {
                /* Correct cg_cm for pbc */
                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                if (bScrew)
                {
                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                }
            }
            else
            {
                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
            }
            nsend++;
            nat += cgindex[cg+1] - cgindex[cg];
        }
    }

    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
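
/* Note on the selection logic in get_zone_pulse_cgs above (a summary, not
 * normative): a charge group goes in the send list when its center is within
 * the non-bonded cut-off (r2 < r_comm2) of the receiving zone, or, on the
 * first pulse only, within the bonded cut-off r_bcomm2 for multi-body
 * (bDistMB) or two-body (bDist2B) interactions; with bBondComm set it is
 * additionally required that a bonded link to this charge group is still
 * missing locally. In the triclinic branch the skew factors and the
 * normal-vector terms correct the plane distances for non-orthogonal
 * cell planes.
 */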
static void setup_dd_communication(gmx_domdec_t *dd,
                                   matrix box, gmx_ddbox_t *ddbox,
                                   t_forcerec *fr, t_state *state, rvec **f)
{
    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
    int                    c, i, j, cg, cg_gl, nrcg;
    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_zones_t    *zones;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    cginfo_mb_t           *cginfo_mb;
    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
    dd_corners_t           corners;
    ivec                   tric_dist;
    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
    real                   skew_fac2_d, skew_fac_01;
    rvec                   sf2_round;
    int                    nsend, nat;
    int                    th;

    comm = dd->comm;

    if (debug)
    {
        fprintf(debug, "Setting up DD communication\n");
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            cg_cm = fr->cg_cm;
            break;
        case ecutsVERLET:
            cg_cm = state->x;
            break;
        default:
            gmx_incons("unimplemented");
            cg_cm = NULL;
    }

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Check if we need to use triclinic distances */
        tric_dist[dim_ind] = 0;
        for (i = 0; i <= dim_ind; i++)
        {
            if (ddbox->tric_dir[dd->dim[i]])
            {
                tric_dist[dim_ind] = 1;
            }
        }
    }

    bBondComm = comm->bBondComm;

    /* Do we need to determine extra distances for multi-body bondeds? */
    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);

    /* Do we need to determine extra distances for only two-body bondeds? */
    bDist2B = (bBondComm && !bDistMB);

    r_comm2  = sqr(comm->cutoff);
    r_bcomm2 = sqr(comm->cutoff_mbody);

    if (debug)
    {
        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
    }

    zones = &comm->zones;

    dim0 = dd->dim[0];
    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);

    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);

    /* Triclinic stuff */
    normal      = ddbox->normal;
    skew_fac_01 = 0;
    if (dd->ndim >= 2)
    {
        v_0 = ddbox->v[dim0];
        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
        {
            /* Determine the coupling coefficient for the distances
             * to the cell planes along dim0 and dim1 through dim2.
             * This is required for correct rounding.
             */
            skew_fac_01 =
                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
            if (debug)
            {
                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
            }
        }
    }
    if (dd->ndim >= 3)
    {
        v_1 = ddbox->v[dim1];
    }

    zone_cg_range = zones->cg_range;
    index_gl      = dd->index_gl;
    cgindex       = dd->cgindex;
    cginfo_mb     = fr->cginfo_mb;

    zone_cg_range[0]   = 0;
    zone_cg_range[1]   = dd->ncg_home;
    comm->zone_ncg1[0] = dd->ncg_home;
    pos_cg             = dd->ncg_home;

    nat_tot = dd->nat_home;
    nzone   = 1;
    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];
        cd  = &comm->cd[dim_ind];

        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
        {
            /* No pbc in this dimension, the first node should not comm. */
            nzone_send = 0;
        }
        else
        {
            nzone_send = nzone;
        }

        v_d         = ddbox->v[dim];
        skew_fac2_d = sqr(ddbox->skew_fac[dim]);

        cd->bInPlace = TRUE;
        for (p = 0; p < cd->np; p++)
        {
            /* Only atoms communicated in the first pulse are used
             * for multi-body bonded interactions or for bBondComm.
             */
            bDistBonded = ((bDistMB || bDist2B) && p == 0);

            ind   = &cd->ind[p];
            nsend = 0;
            nat   = 0;
            for (zone = 0; zone < nzone_send; zone++)
            {
                if (tric_dist[dim_ind] && dim_ind > 0)
                {
                    /* Determine slightly more optimized skew_fac's
                     * for rounding.
                     * This reduces the number of communicated atoms
                     * by about 10% for 3D DD of rhombic dodecahedra.
                     */
                    for (dimd = 0; dimd < dim; dimd++)
                    {
                        sf2_round[dimd] = 1;
                        if (ddbox->tric_dir[dimd])
                        {
                            for (i = dd->dim[dimd]+1; i < DIM; i++)
                            {
                                /* If we are shifted in dimension i
                                 * and the cell plane is tilted forward
                                 * in dimension i, skip this coupling.
                                 */
                                if (!(zones->shift[nzone+zone][i] &&
                                      ddbox->v[dimd][i][dimd] >= 0))
                                {
                                    sf2_round[dimd] +=
                                        sqr(ddbox->v[dimd][i][dimd]);
                                }
                            }
                            sf2_round[dimd] = 1/sf2_round[dimd];
                        }
                    }
                }

                zonei = zone_perm[dim_ind][zone];
                if (p == 0)
                {
                    /* Here we permute the zones to obtain a convenient order
                     * for neighbor searching
                     */
                    cg0 = zone_cg_range[zonei];
                    cg1 = zone_cg_range[zonei+1];
                }
                else
                {
                    /* Look only at the cg's received in the previous grid pulse
                     */
                    cg1 = zone_cg_range[nzone+zone+1];
                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
                }

#pragma omp parallel for num_threads(comm->nth) schedule(static)
                for (th = 0; th < comm->nth; th++)
                {
                    gmx_domdec_ind_t *ind_p;
                    int             **ibuf_p, *ibuf_nalloc_p;
                    vec_rvec_t       *vbuf_p;
                    int              *nsend_p, *nat_p;
                    int              *nsend_zone_p;
                    int               cg0_th, cg1_th;

                    if (th == 0)
                    {
                        /* Thread 0 writes in the comm buffers */
                        ind_p         = ind;
                        ibuf_p        = &comm->buf_int;
                        ibuf_nalloc_p = &comm->nalloc_int;
                        vbuf_p        = &comm->vbuf;
                        nsend_p       = &nsend;
                        nat_p         = &nat;
                        nsend_zone_p  = &ind->nsend[zone];
                    }
                    else
                    {
                        /* Other threads write into temp buffers */
                        ind_p         = &comm->dth[th].ind;
                        ibuf_p        = &comm->dth[th].ibuf;
                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
                        vbuf_p        = &comm->dth[th].vbuf;
                        nsend_p       = &comm->dth[th].nsend;
                        nat_p         = &comm->dth[th].nat;
                        nsend_zone_p  = &comm->dth[th].nsend_zone;

                        comm->dth[th].nsend      = 0;
                        comm->dth[th].nat        = 0;
                        comm->dth[th].nsend_zone = 0;
                    }

                    /* Divide the charge groups over the threads */
                    cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
                    cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;

                    /* Get the cg's for this pulse in this zone */
                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
                                       index_gl, cgindex,
                                       dim, dim_ind, dim0, dim1, dim2,
                                       r_comm2, r_bcomm2,
                                       box, tric_dist,
                                       normal, skew_fac2_d, skew_fac_01,
                                       v_d, v_0, v_1, &corners, sf2_round,
                                       bDistBonded, bBondComm,
                                       bDist2B, bDistMB,
                                       cg_cm, fr->cginfo,
                                       ind_p,
                                       ibuf_p, ibuf_nalloc_p,
                                       vbuf_p,
                                       nsend_p, nat_p,
                                       nsend_zone_p);
                }

                /* Append data of threads>=1 to the communication buffers */
                for (th = 1; th < comm->nth; th++)
                {
                    dd_comm_setup_work_t *dth;
                    int                   ns1;

                    dth = &comm->dth[th];

                    ns1 = nsend + dth->nsend_zone;
                    if (ns1 > ind->nalloc)
                    {
                        ind->nalloc = over_alloc_dd(ns1);
                        srenew(ind->index, ind->nalloc);
                    }
                    if (ns1 > comm->nalloc_int)
                    {
                        comm->nalloc_int = over_alloc_dd(ns1);
                        srenew(comm->buf_int, comm->nalloc_int);
                    }
                    if (ns1 > comm->vbuf.nalloc)
                    {
                        comm->vbuf.nalloc = over_alloc_dd(ns1);
                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
                    }

                    for (i = 0; i < dth->nsend_zone; i++)
                    {
                        ind->index[nsend]    = dth->ind.index[i];
                        comm->buf_int[nsend] = dth->ibuf[i];
                        copy_rvec(dth->vbuf.v[i],
                                  comm->vbuf.v[nsend]);
                        nsend++;
                    }
                    nat              += dth->nat;
                    ind->nsend[zone] += dth->nsend_zone;
                }
            }
            /* Clear the counts in case we do not have pbc */
            for (zone = nzone_send; zone < nzone; zone++)
            {
                ind->nsend[zone] = 0;
            }
            ind->nsend[nzone]   = nsend;
            ind->nsend[nzone+1] = nat;
            /* Communicate the number of cg's and atoms to receive */
            dd_sendrecv_int(dd, dim_ind, dddirBackward,
                            ind->nsend, nzone+2,
                            ind->nrecv, nzone+2);

            /* The rvec buffer is also required for atom buffers of size nsend
             * in dd_move_x and dd_move_f.
             */
            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);

            if (p > 0)
            {
                /* We can receive in place if only the last zone is not empty */
                for (zone = 0; zone < nzone-1; zone++)
                {
                    if (ind->nrecv[zone] > 0)
                    {
                        cd->bInPlace = FALSE;
                    }
                }
                if (!cd->bInPlace)
                {
                    /* The int buffer is only required here for the cg indices */
                    if (ind->nrecv[nzone] > comm->nalloc_int2)
                    {
                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
                        srenew(comm->buf_int2, comm->nalloc_int2);
                    }
                    /* The rvec buffer is also required for atom buffers
                     * of size nrecv in dd_move_x and dd_move_f.
                     */
                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
                    vec_rvec_check_alloc(&comm->vbuf2, i);
                }
            }

            /* Make space for the global cg indices */
            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
                || dd->cg_nalloc == 0)
            {
                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
                srenew(index_gl, dd->cg_nalloc);
                srenew(cgindex, dd->cg_nalloc+1);
            }
            /* Communicate the global cg indices */
            if (cd->bInPlace)
            {
                recv_i = index_gl + pos_cg;
            }
            else
            {
                recv_i = comm->buf_int2;
            }
            dd_sendrecv_int(dd, dim_ind, dddirBackward,
                            comm->buf_int, nsend,
                            recv_i,        ind->nrecv[nzone]);

            /* Make space for cg_cm */
            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
            if (fr->cutoff_scheme == ecutsGROUP)
            {
                cg_cm = fr->cg_cm;
            }
            else
            {
                cg_cm = state->x;
            }
            /* Communicate cg_cm */
            if (cd->bInPlace)
            {
                recv_vr = cg_cm + pos_cg;
            }
            else
            {
                recv_vr = comm->vbuf2.v;
            }
            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
                             comm->vbuf.v, nsend,
                             recv_vr,      ind->nrecv[nzone]);

            /* Make the charge group index */
            if (cd->bInPlace)
            {
                zone = (p == 0 ? 0 : nzone - 1);
                while (zone < nzone)
                {
                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
                    {
                        cg_gl              = index_gl[pos_cg];
                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
                        if (bBondComm)
                        {
                            /* Update the charge group presence,
                             * so we can use it in the next pass of the loop.
                             */
                            comm->bLocalCG[cg_gl] = TRUE;
                        }
                        pos_cg++;
                    }
                    if (p == 0)
                    {
                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
                    }
                    zone++;
                    zone_cg_range[nzone+zone] = pos_cg;
                }
            }
            else
            {
                /* This part of the code is never executed with bBondComm. */
                merge_cg_buffers(nzone, cd, p, zone_cg_range,
                                 index_gl, recv_i, cg_cm, recv_vr,
                                 cgindex, fr->cginfo_mb, fr->cginfo);
                pos_cg += ind->nrecv[nzone];
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        if (!cd->bInPlace)
        {
            /* Store the atom block for easy copying of communication buffers */
            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
        }
        nzone += nzone;
    }
    dd->index_gl = index_gl;
    dd->cgindex  = cgindex;

    dd->ncg_tot          = zone_cg_range[zones->n];
    dd->nat_tot          = nat_tot;
    comm->nat[ddnatHOME] = dd->nat_home;
    for (i = ddnatZONE; i < ddnatNR; i++)
    {
        comm->nat[i] = dd->nat_tot;
    }

    if (!bBondComm)
    {
        /* We don't need to update cginfo, since that was already done above.
         * So we pass NULL for the forcerec.
         */
        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
                      NULL, comm->bLocalCG);
    }

    if (debug)
    {
        fprintf(debug, "Finished setting up DD communication, zones:");
        for (c = 0; c < zones->n; c++)
        {
            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
        }
        fprintf(debug, "\n");
    }
}
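
/* Outline of setup_dd_communication: for each DD dimension, each pulse
 * selects the charge groups to send (thread-parallel via get_zone_pulse_cgs),
 * exchanges the send/receive counts with dd_sendrecv_int, and then
 * communicates the global cg indices and the cg centers backward along the
 * dimension. Data is received in place when only the last zone is non-empty;
 * otherwise it passes through buf_int2/vbuf2 and is merged by
 * merge_cg_buffers. The zone count doubles with each dimension
 * (nzone += nzone), yielding the 1, 2, 4 or 8 DD zones.
 */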
static void set_cg_boundaries(gmx_domdec_zones_t *zones)
{
    int c;

    for (c = 0; c < zones->nizone; c++)
    {
        zones->izone[c].cg1  = zones->cg_range[c+1];
        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
    }
}
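
/* The izone entries map each i-zone to the charge-group ranges of the
 * j-zones it interacts with during neighbor searching: cg1 closes the
 * i-zone itself, while jcg0 and jcg1 bracket the charge groups of the
 * j-zones it sees.
 */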
static void set_zones_size(gmx_domdec_t *dd,
                           matrix box, const gmx_ddbox_t *ddbox,
                           int zone_start, int zone_end)
{
    gmx_domdec_comm_t  *comm;
    gmx_domdec_zones_t *zones;
    gmx_bool            bDistMB;
    int                 z, zi, zj0, zj1, d, dim;
    real                rcs, rcmbs;
    int                 i, j;
    real                size_j, add_tric;
    real                vol;

    comm = dd->comm;

    zones = &comm->zones;

    /* Do we need to determine extra distances for multi-body bondeds? */
    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);

    for (z = zone_start; z < zone_end; z++)
    {
        /* Copy cell limits to zone limits.
         * Valid for non-DD dims and non-shifted dims.
         */
        copy_rvec(comm->cell_x0, zones->size[z].x0);
        copy_rvec(comm->cell_x1, zones->size[z].x1);
    }

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];

        for (z = 0; z < zones->n; z++)
        {
            /* With a staggered grid we have different sizes
             * for non-shifted dimensions.
             */
            if (dd->bGridJump && zones->shift[z][dim] == 0)
            {
                if (d == 1)
                {
                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
                }
                else if (d == 2)
                {
                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
                }
            }
        }

        rcs   = comm->cutoff;
        rcmbs = comm->cutoff_mbody;
        if (ddbox->tric_dir[dim])
        {
            rcs   /= ddbox->skew_fac[dim];
            rcmbs /= ddbox->skew_fac[dim];
        }

        /* Set the lower limit for the shifted zone dimensions */
        for (z = zone_start; z < zone_end; z++)
        {
            if (zones->shift[z][dim] > 0)
            {
                if (!dd->bGridJump || d == 0)
                {
                    zones->size[z].x0[dim] = comm->cell_x1[dim];
                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
                }
                else
                {
                    /* Here we take the lower limit of the zone from
                     * the lowest domain of the zone below.
                     */
                    if (z < 4)
                    {
                        zones->size[z].x0[dim] =
                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
                    }
                    else
                    {
                        if (d == 1)
                        {
                            zones->size[z].x0[dim] =
                                zones->size[zone_perm[2][z-4]].x0[dim];
                        }
                        else
                        {
                            zones->size[z].x0[dim] =
                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
                        }
                    }
                    /* A temporary limit, is updated below */
                    zones->size[z].x1[dim] = zones->size[z].x0[dim];

                    if (bDistMB)
                    {
                        for (zi = 0; zi < zones->nizone; zi++)
                        {
                            if (zones->shift[zi][dim] == 0)
                            {
                                /* This takes the whole zone into account.
                                 * With multiple pulses this will lead
                                 * to a larger zone than strictly necessary.
                                 */
                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
                                                             zones->size[zi].x1[dim]+rcmbs);
                            }
                        }
                    }
                }
            }
        }

        /* Loop over the i-zones to set the upper limit of each
         * j-zone they see.
         */
        for (zi = 0; zi < zones->nizone; zi++)
        {
            if (zones->shift[zi][dim] == 0)
            {
                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
                {
                    if (zones->shift[z][dim] > 0)
                    {
                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
                                                     zones->size[zi].x1[dim]+rcs);
                    }
                }
            }
        }
    }

    for (z = zone_start; z < zone_end; z++)
    {
        /* Initialization only required to keep the compiler happy */
        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
        int  nc, c;

        /* To determine the bounding box for a zone we need to find
         * the extreme corners of 4, 2 or 1 corners.
         */
        nc = 1 << (ddbox->npbcdim - 1);

        for (c = 0; c < nc; c++)
        {
            /* Set up a zone corner at x=0, ignoring triclinic couplings */
            corner[XX] = 0;
            if ((c & 1) == 0)
            {
                corner[YY] = zones->size[z].x0[YY];
            }
            else
            {
                corner[YY] = zones->size[z].x1[YY];
            }
            if ((c & 2) == 0)
            {
                corner[ZZ] = zones->size[z].x0[ZZ];
            }
            else
            {
                corner[ZZ] = zones->size[z].x1[ZZ];
            }
            if (dd->ndim == 1 && box[ZZ][YY] != 0)
            {
                /* With 1D domain decomposition the cg's are not in
                 * the triclinic box, but triclinic x-y and rectangular y-z.
                 * Shift y back, so it will later end up at 0.
                 */
                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
            }
            /* Apply the triclinic couplings */
            for (i = YY; i < ddbox->npbcdim; i++)
            {
                for (j = XX; j < i; j++)
                {
                    corner[j] += corner[i]*box[i][j]/box[i][i];
                }
            }
            if (c == 0)
            {
                copy_rvec(corner, corner_min);
                copy_rvec(corner, corner_max);
            }
            else
            {
                for (i = 0; i < DIM; i++)
                {
                    corner_min[i] = min(corner_min[i], corner[i]);
                    corner_max[i] = max(corner_max[i], corner[i]);
                }
            }
        }
        /* Copy the extreme corners without offset along x */
        for (i = 0; i < DIM; i++)
        {
            zones->size[z].bb_x0[i] = corner_min[i];
            zones->size[z].bb_x1[i] = corner_max[i];
        }
        /* Add the offset along x */
        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
    }

    if (zone_start == 0)
    {
        vol = 1;
        for (dim = 0; dim < DIM; dim++)
        {
            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
        }
        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
    }

    if (debug)
    {
        for (z = zone_start; z < zone_end; z++)
        {
            fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
                    z,
                    zones->size[z].x0[XX], zones->size[z].x1[XX],
                    zones->size[z].x0[YY], zones->size[z].x1[YY],
                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
            fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
                    z,
                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
        }
    }
}
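
/* set_zones_size first fixes the zone limits along each decomposition
 * dimension (shifted zones start at the cell upper limit and extend by the
 * cut-off, divided by skew_fac for triclinic directions), then derives a
 * rectangular bounding box per zone by taking the extreme coordinates of
 * the 1, 2 or 4 zone corners after applying the triclinic couplings.
 */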
static int comp_cgsort(const void *a, const void *b)
{
    int           comp;

    gmx_cgsort_t *cga, *cgb;
    cga = (gmx_cgsort_t *)a;
    cgb = (gmx_cgsort_t *)b;

    /* Sort on the ns grid cell index first */
    comp = cga->nsc - cgb->nsc;
    if (comp == 0)
    {
        /* Within the same cell, sort on the global topology index */
        comp = cga->ind_gl - cgb->ind_gl;
    }

    return comp;
}
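
/* comp_cgsort orders charge groups by ns grid cell index with the global
 * topology index as tie-breaker, so it can be passed directly to qsort,
 * exactly as done below in ordered_sort and dd_sort_order:
 *
 *   qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 */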
static void order_int_cg(int n, const gmx_cgsort_t *sort,
                         int *a, int *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        buf[i] = a[sort[i].ind];
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        a[i] = buf[i];
    }
}
static void order_vec_cg(int n, const gmx_cgsort_t *sort,
                         rvec *v, rvec *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        copy_rvec(v[sort[i].ind], buf[i]);
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        copy_rvec(buf[i], v[i]);
    }
}
static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
                           rvec *v, rvec *buf)
{
    int a, atot, cg, cg0, cg1, i;

    if (cgindex == NULL)
    {
        /* Avoid the useless loop of the atoms within a cg */
        order_vec_cg(ncg, sort, v, buf);

        return;
    }

    /* Order the data */
    a = 0;
    for (cg = 0; cg < ncg; cg++)
    {
        cg0 = cgindex[sort[cg].ind];
        cg1 = cgindex[sort[cg].ind+1];
        for (i = cg0; i < cg1; i++)
        {
            copy_rvec(v[i], buf[a]);
            a++;
        }
    }
    atot = a;

    /* Copy back to the original array */
    for (a = 0; a < atot; a++)
    {
        copy_rvec(buf[a], v[a]);
    }
}
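
/* The order_* helpers above apply the sort permutation out of place: they
 * gather the elements into buf in the new order, then copy the buffer back.
 * This costs one scratch array but stays trivially correct even though the
 * source and destination indices overlap.
 */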
static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
                         int nsort_new, gmx_cgsort_t *sort_new,
                         gmx_cgsort_t *sort1)
{
    int i1, i2, i_new;

    /* The new indices are not very ordered, so we qsort them */
    qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);

    /* sort2 is already ordered, so now we can merge the two arrays */
    i1    = 0;
    i2    = 0;
    i_new = 0;
    while (i2 < nsort2 || i_new < nsort_new)
    {
        if (i2 == nsort2)
        {
            sort1[i1++] = sort_new[i_new++];
        }
        else if (i_new == nsort_new)
        {
            sort1[i1++] = sort2[i2++];
        }
        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
                 (sort2[i2].nsc == sort_new[i_new].nsc &&
                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
        {
            sort1[i1++] = sort2[i2++];
        }
        else
        {
            sort1[i1++] = sort_new[i_new++];
        }
    }
}
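
/* ordered_sort is effectively a single merge pass: the already ordered list
 * sort2 (charge groups that kept their ns cell) is merged with the qsort-ed
 * list of new or moved entries, costing O(n + m log m) instead of
 * re-sorting all n + m elements.
 */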
static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
{
    gmx_domdec_sort_t *sort;
    gmx_cgsort_t      *cgsort, *sort_i;
    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
    int                sort_last, sort_skip;

    sort = dd->comm->sort;

    a = fr->ns.grid->cell_index;

    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;

    if (ncg_home_old >= 0)
    {
        /* The charge groups that remained in the same ns grid cell
         * are completely ordered. So we can sort efficiently by sorting
         * only the charge groups that moved and merging them into the
         * stationary list.
         */
        ncg_new   = 0;
        nsort2    = 0;
        nsort_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Check if this cg did not move to another node */
            if (a[i] < moved)
            {
                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
                {
                    /* This cg is new on this node or moved ns grid cell */
                    if (nsort_new >= sort->sort_new_nalloc)
                    {
                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
                        srenew(sort->sort_new, sort->sort_new_nalloc);
                    }
                    sort_i = &(sort->sort_new[nsort_new++]);
                }
                else
                {
                    /* This cg did not move */
                    sort_i = &(sort->sort2[nsort2++]);
                }
                /* Sort on the ns grid cell indices
                 * and the global topology index.
                 * index_gl is irrelevant with cell ns,
                 * but we set it here anyhow to avoid a conditional.
                 */
                sort_i->nsc    = a[i];
                sort_i->ind_gl = dd->index_gl[i];
                sort_i->ind    = i;
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
                    nsort2, nsort_new);
        }
        /* Sort efficiently */
        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
                     sort->sort);
    }
    else
    {
        cgsort  = sort->sort;
        ncg_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Sort on the ns grid cell indices
             * and the global topology index
             */
            cgsort[i].nsc    = a[i];
            cgsort[i].ind_gl = dd->index_gl[i];
            cgsort[i].ind    = i;
            if (cgsort[i].nsc < moved)
            {
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
        }
        /* Determine the order of the charge groups using qsort */
        qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
    }

    return ncg_new;
}
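
/* Charge groups that moved to another node were flagged in the ns grid
 * with a cell index of at least NSGRID_SIGNAL_MOVED_FAC*ncells, so the
 * a[i] < moved test both skips them in the ordered path and, in the qsort
 * path, pushes them to the end of the sorted order.
 */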
static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
{
    gmx_cgsort_t *sort;
    int           ncg_new, i, *a, na;

    sort = dd->comm->sort->sort;

    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);

    ncg_new = 0;
    for (i = 0; i < na; i++)
    {
        if (a[i] >= 0)
        {
            sort[ncg_new].ind = a[i];
            ncg_new++;
        }
    }

    return ncg_new;
}
static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
                          int ncg_home_old)
{
    gmx_domdec_sort_t *sort;
    gmx_cgsort_t      *cgsort, *sort_i;
    int               *cgindex;
    int                ncg_new, i, *ibuf, cgsize;
    rvec              *vbuf;

    sort = dd->comm->sort;

    if (dd->ncg_home > sort->sort_nalloc)
    {
        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
        srenew(sort->sort, sort->sort_nalloc);
        srenew(sort->sort2, sort->sort_nalloc);
    }
    cgsort = sort->sort;

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
            break;
        case ecutsVERLET:
            ncg_new = dd_sort_order_nbnxn(dd, fr);
            break;
        default:
            gmx_incons("unimplemented");
            ncg_new = 0;
    }

    /* We alloc with the old size, since cgindex is still old */
    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
    vbuf = dd->comm->vbuf.v;

    if (dd->comm->bCGs)
    {
        cgindex = dd->cgindex;
    }
    else
    {
        cgindex = NULL;
    }

    /* Remove the charge groups which are no longer at home here */
    dd->ncg_home = ncg_new;
    if (debug)
    {
        fprintf(debug, "Set the new home charge group count to %d\n",
                dd->ncg_home);
    }

    /* Reorder the state */
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && (state->flags & (1<<i)))
        {
            switch (i)
            {
                case estX:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
                    break;
                case estV:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
                    break;
                case estSDX:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
                    break;
                case estCGP:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
                    break;
                case estLD_RNG:
                case estLD_RNGI:
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* No ordering required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_sort_state");
                    break;
            }
        }
    }
    if (fr->cutoff_scheme == ecutsGROUP)
    {
        /* Reorder cgcm */
        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
    }

    if (dd->ncg_home+1 > sort->ibuf_nalloc)
    {
        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
        srenew(sort->ibuf, sort->ibuf_nalloc);
    }
    ibuf = sort->ibuf;
    /* Reorder the global cg index */
    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
    /* Reorder the cginfo */
    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
    /* Rebuild the local cg index */
    if (dd->comm->bCGs)
    {
        ibuf[0] = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
            ibuf[i+1] = ibuf[i] + cgsize;
        }
        for (i = 0; i < dd->ncg_home+1; i++)
        {
            dd->cgindex[i] = ibuf[i];
        }
    }
    else
    {
        for (i = 0; i < dd->ncg_home+1; i++)
        {
            dd->cgindex[i] = i;
        }
    }
    /* Set the home atom number */
    dd->nat_home = dd->cgindex[dd->ncg_home];

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* The atoms are now exactly in grid order, update the grid order */
        nbnxn_set_atomorder(fr->nbv->nbs);
    }
    else
    {
        /* Copy the sorted ns cell indices back to the ns grid struct */
        for (i = 0; i < dd->ncg_home; i++)
        {
            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
        }
        fr->ns.grid->nr = dd->ncg_home;
    }
}
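
/* dd_sort_state applies one and the same permutation (sort->sort) to all
 * distributed state arrays, the cg centers, the global cg index and the
 * cginfo, so every per-atom and per-cg array stays mutually consistent;
 * the communication vbuf is reused as the scratch buffer for the
 * out-of-place reordering.
 */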
static void add_dd_statistics(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;

    comm = dd->comm;

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] +=
            comm->nat[ddnat] - comm->nat[ddnat-1];
    }
    comm->ndecomp++;
}
void reset_dd_statistics_counters(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;

    comm = dd->comm;

    /* Reset all the statistics and counters for total run counting */
    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] = 0;
    }
    comm->ndecomp   = 0;
    comm->nload     = 0;
    comm->load_step = 0;
    comm->load_sum  = 0;
    comm->load_max  = 0;
    clear_ivec(comm->load_lim);
    comm->load_mdf = 0;
    comm->load_pme = 0;
}
void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;
    double             av;

    comm = cr->dd->comm;

    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);

    if (fplog == NULL)
    {
        return;
    }

    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
        switch (ddnat)
        {
            case ddnatZONE:
                fprintf(fplog,
                        " av. #atoms communicated per step for force:  %d x %.1f\n",
                        2, av);
                break;
            case ddnatVSITE:
                if (cr->dd->vsite_comm)
                {
                    fprintf(fplog,
                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
                            av);
                }
                break;
            case ddnatCON:
                if (cr->dd->constraint_comm)
                {
                    fprintf(fplog,
                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
                            1 + ir->nLincsIter, av);
                }
                break;
            default:
                gmx_incons(" Unknown type for DD statistics");
        }
    }
    fprintf(fplog, "\n");

    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
    {
        print_dd_load_av(fplog, cr->dd);
    }
}
void dd_partition_system(FILE                *fplog,
                         gmx_large_int_t      step,
                         t_commrec           *cr,
                         gmx_bool             bMasterState,
                         int                  nstglobalcomm,
                         t_state             *state_global,
                         gmx_mtop_t          *top_global,
                         t_inputrec          *ir,
                         t_state             *state_local,
                         rvec               **f,
                         t_mdatoms           *mdatoms,
                         gmx_localtop_t      *top_local,
                         t_forcerec          *fr,
                         gmx_vsite_t         *vsite,
                         gmx_shellfc_t        shellfc,
                         gmx_constr_t         constr,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle,
                         gmx_bool             bVerbose)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t        ddbox = {0};
    t_block           *cgs_gl;
    gmx_large_int_t    step_pcoupl;
    rvec               cell_ns_x0, cell_ns_x1;
    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
    gmx_bool           bRedist, bSortCG, bResortAll;
    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
    real               grid_density;
    char               sbuf[22];

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || DEFORM(*ir));
    if (ir->epc != epcNO)
    {
        /* With nstpcouple > 1 pressure coupling happens
         * one step after calculating the pressure.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstpcouple;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }

    bNStGlobalComm = (step % nstglobalcomm == 0);

    if (!comm->bDynLoadBal)
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm))
    {
        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
        {
            /* Check if we should use DLB at the second partitioning
             * and every 100 partitionings,
             * so the extra communication cost is negligible.
             */
            n         = max(100, nstglobalcomm);
            bCheckDLB = (comm->n_load_collect == 0 ||
                         comm->n_load_have % n == n-1);
        }
        else
        {
            bCheckDLB = FALSE;
        }

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (ir->nsteps >= 0 &&
                     (step + ir->nstlist > ir->init_step + ir->nsteps)));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckDLB ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckDLB)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    bTurnOnDLB =
                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                    bDoDLB = TRUE;
                }
            }
        }
        comm->n_load_have++;
    }

    cgs_gl = &comm->cgs_gl;

    bRedist = FALSE;
    if (bMasterState)
    {
        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);
        ncgindex_set = 0;

        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
                  TRUE, cgs_gl, state_global->x, &ddbox);

        get_cg_distribution(fplog, step, dd, cgs_gl,
                            state_global->box, &ddbox, state_global->x);

        dd_distribute_state(dd, cgs_gl,
                            state_global, state_local, f);

        dd_make_local_cgs(dd, &top_local->cgs);

        /* Ensure that we have space for the new distribution */
        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
    }
    else if (state_local->ddp_count != dd->ddp_count)
    {
        if (state_local->ddp_count > dd->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
        }

        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
        }

        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        /* Build the new indices */
        rebuild_cgindex(dd, cgs_gl->index, state_local);
        make_dd_indices(dd, cgs_gl->index, 0);
        ncgindex_set = dd->ncg_home;

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            /* Redetermine the cg COMs */
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  TRUE, &top_local->cgs, state_local->x, &ddbox);

        bRedist = comm->bDynLoadBal;
    }
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
        ncgindex_set = 0;

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            copy_rvec(comm->box0, ddbox.box0);
            copy_rvec(comm->box_size, ddbox.box_size);
        }
        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);

        bBoxChanged = TRUE;
        bRedist     = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0, comm->box0);
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr,
                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }

    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns.grid->n, ncells_old);
            grid_first(fplog, fr->ns.grid, dd, &ddbox,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlistlong, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);

    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  state_local->x,
                                  ncg_moved, bRedist ? comm->moved : NULL,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns.grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        dd_sort_state(dd, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        ga2la_clear(dd->ga2la);
        ncgindex_set = 0;

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }

    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Setup up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, ncgindex_set);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home", step, "dump", top_global, cr,
                    -1, state_local->x, state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);

    /* Set up the special atom communication */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We make all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);

    /* Now we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    setup_bonded_threading(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME only node */
        gmx_pme_send_parameters(cr, mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->c6A, mdatoms->c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->ePull != epullNO)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }
}
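
/* Summary of dd_partition_system: decide whether load balancing and cg
 * sorting are needed, (re)distribute the full state or just the charge
 * groups, set the DD cell and zone sizes, optionally sort the local state
 * on grid position, set up the halo communication
 * (setup_dd_communication), rebuild the local topology and the vsite and
 * constraint communication, update mdatoms and the auxiliary modules
 * (PME, pull, rotation), and finally record statistics and the
 * partitioning step.
 */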